Importation et inspection des données¶

In [ ]:
#changement de dossier
import os
os.chdir("C:/Users/ricco/Desktop/demo")

#importation du fichier
import pandas
D = pandas.read_excel("bieres.xlsx")
D.head()
Out[ ]:
cost size alcohol reputat color aroma taste
0 10 15 20 85 40 30 50
1 100 70 50 30 75 60 80
2 65 30 35 80 80 60 90
3 0 0 20 30 80 90 100
4 10 25 10 100 50 40 60
In [ ]:
#statistiques descriptives
D.describe()
Out[ ]:
cost size alcohol reputat color aroma taste
count 99.000000 99.000000 99.000000 99.000000 99.000000 99.000000 99.000000
mean 27.777778 22.222222 23.888889 55.555556 63.888889 56.111111 80.555556
std 31.349106 20.256294 12.259015 25.891147 18.162469 19.789138 17.318872
min 0.000000 0.000000 10.000000 30.000000 40.000000 30.000000 50.000000
25% 10.000000 10.000000 15.000000 30.000000 50.000000 40.000000 65.000000
50% 15.000000 15.000000 20.000000 40.000000 60.000000 60.000000 85.000000
75% 25.000000 30.000000 30.000000 80.000000 80.000000 65.000000 95.000000
max 100.000000 70.000000 50.000000 100.000000 95.000000 90.000000 100.000000
In [ ]:
#nombre d'observations
n = D.shape[0]
print(n)

#nombre de variables
p = D.shape[1]
print(p)
99
7

ACP avec le package "statsmodels"¶

In [ ]:
#vérification du package - présent par défaut avec Anaconda
import statsmodels
statsmodels.__version__
Out[ ]:
'0.13.5'
In [ ]:
#classe de calcul
from statsmodels.multivariate.pca import PCA

#instanciation et calculs
# method = décomposition en valeurs singulières
acp = PCA(D,standardize=True,method="svd")

#valeurs propres
eig_val = acp.eigenvals/n
print(eig_val)
0    3.306082
1    2.719206
2    0.567371
3    0.193431
4    0.119954
5    0.076735
6    0.017222
Name: eigenvals, dtype: float64
In [ ]:
#screeplot
import numpy, matplotlib.pyplot as plt
plt.plot(numpy.arange(1,eig_val.shape[0]+1,1),eig_val,marker="+")
Out[ ]:
[<matplotlib.lines.Line2D at 0x259f2e77040>]
In [ ]:
#part de variance restituée
acp.rsquare
Out[ ]:
ncomp
0    0.000000
1    0.472297
2    0.860755
3    0.941808
4    0.969441
5    0.986578
6    0.997540
7    1.000000
Name: rsquare, dtype: float64

Analyse des variables¶

In [ ]:
#loadings
acp.loadings
Out[ ]:
comp_0 comp_1 comp_2 comp_3 comp_4 comp_5 comp_6
cost 0.277777 -0.495104 0.196444 -0.501708 0.053491 -0.115932 0.609188
size 0.124157 -0.576560 -0.019033 0.023074 0.345097 0.599013 -0.416366
alcohol 0.328726 -0.458963 -0.033555 0.422006 -0.136822 -0.638922 -0.274110
reputat -0.407584 -0.075074 0.873209 -0.016132 0.068973 -0.140076 -0.202747
color 0.498336 0.113813 0.365705 0.173596 -0.644671 0.398391 0.022730
aroma 0.432358 0.311423 0.245264 0.407941 0.650799 0.005772 0.256787
taste 0.444936 0.310209 0.059439 -0.610620 0.124217 -0.202859 -0.522333
In [ ]:
#correlations des var. avec les composantes
#loadings * racine carrée des val.p.
corr_fact = acp.loadings*pandas.Series(numpy.sqrt(eig_val.values),index=acp.loadings.columns)
print(corr_fact)
           comp_0    comp_1    comp_2    comp_3    comp_4    comp_5    comp_6
cost     0.505072 -0.816427  0.147969 -0.220655  0.018526 -0.032114  0.079946
size     0.225750 -0.950749 -0.014336  0.010148  0.119522  0.165933 -0.054641
alcohol  0.597710 -0.756830 -0.025275  0.185602 -0.047387 -0.176988 -0.035972
reputat -0.741096 -0.123797  0.657736 -0.007095  0.023888 -0.038802 -0.026607
color    0.906105  0.187678  0.275464  0.076349 -0.223277  0.110358  0.002983
aroma    0.786140  0.513537  0.184743  0.179416  0.225400  0.001599  0.033699
taste    0.809010  0.511535  0.044772 -0.268556  0.043022 -0.056194 -0.068547
In [ ]:
#récupérer les corrélations pour les 3 premières composantes
corr_3fact = corr_fact[['comp_0','comp_1','comp_2']]

#sous-forme de heatmap - 3 premiers facteurs
import seaborn as sns
sns.heatmap(corr_3fact,vmin=-1,vmax=1.0,center=0,cmap=sns.diverging_palette(20,250,as_cmap=True),linewidths=0.5,annot=True)
Out[ ]:
<Axes: >

Rotation de facteurs - Varimax¶

In [ ]:
#classe de calcul
from statsmodels.multivariate.factor_rotation import rotate_factors

#rotation varimax -> travail sur les contributions
L, T = rotate_factors(corr_3fact.values,method='varimax')

#corrélation après rotation
corrRotate = L
print(corrRotate)
[[ 0.11565405 -0.96445206 -0.00133856]
 [-0.23346253 -0.94882129 -0.01804583]
 [ 0.14321045 -0.93235262 -0.20223038]
 [-0.37252761  0.13644959  0.91638973]
 [ 0.92129565 -0.24721992 -0.14908654]
 [ 0.92731451  0.10617681 -0.21137868]
 [ 0.88640491  0.10926312 -0.34715758]]
In [ ]:
#sous-forme de pandas dataframe
dfCorrRotate = pandas.DataFrame(corrRotate,index=corr_3fact.index,columns=corr_3fact.columns)
print(dfCorrRotate)
           comp_0    comp_1    comp_2
cost     0.115654 -0.964452 -0.001339
size    -0.233463 -0.948821 -0.018046
alcohol  0.143210 -0.932353 -0.202230
reputat -0.372528  0.136450  0.916390
color    0.921296 -0.247220 -0.149087
aroma    0.927315  0.106177 -0.211379
taste    0.886405  0.109263 -0.347158
In [ ]:
#part de variance expliquée sur les 3 facteurs
var_restit = numpy.sum(dfCorrRotate.values**2,axis=0)
print(var_restit)
[2.72157822 2.80265916 1.06842083]
In [ ]:
#la somme totale préservée
numpy.sum(var_restit)/p
Out[ ]:
0.9418083157346617
In [ ]:
#heatmap again
sns.heatmap(dfCorrRotate,vmin=-1,vmax=1.0,center=0,cmap=sns.diverging_palette(20,250,as_cmap=True),linewidths=0.5,annot=True)
Out[ ]:
<Axes: >
In [ ]:
#T étant la matrice de rotation
#on peut l'appliquer sur les variables
numpy.dot(corr_3fact.values,T)
Out[ ]:
array([[ 0.11565405, -0.96445206, -0.00133856],
       [-0.23346253, -0.94882129, -0.01804583],
       [ 0.14321045, -0.93235262, -0.20223038],
       [-0.37252761,  0.13644959,  0.91638973],
       [ 0.92129565, -0.24721992, -0.14908654],
       [ 0.92731451,  0.10617681, -0.21137868],
       [ 0.88640491,  0.10926312, -0.34715758]])
In [ ]:
#on pourrait aussi l'appliquer sur les coordonnées factorielles
#des individus pour obtenir leurs positions dans le nouveau repère