#changement de dossier
import os
os.chdir("C:/Users/ricco/Desktop/demo")
#importation du fichier
import pandas
D = pandas.read_excel("bieres.xlsx")
D.head()
cost | size | alcohol | reputat | color | aroma | taste | |
---|---|---|---|---|---|---|---|
0 | 10 | 15 | 20 | 85 | 40 | 30 | 50 |
1 | 100 | 70 | 50 | 30 | 75 | 60 | 80 |
2 | 65 | 30 | 35 | 80 | 80 | 60 | 90 |
3 | 0 | 0 | 20 | 30 | 80 | 90 | 100 |
4 | 10 | 25 | 10 | 100 | 50 | 40 | 60 |
#statistiques descriptives
D.describe()
cost | size | alcohol | reputat | color | aroma | taste | |
---|---|---|---|---|---|---|---|
count | 99.000000 | 99.000000 | 99.000000 | 99.000000 | 99.000000 | 99.000000 | 99.000000 |
mean | 27.777778 | 22.222222 | 23.888889 | 55.555556 | 63.888889 | 56.111111 | 80.555556 |
std | 31.349106 | 20.256294 | 12.259015 | 25.891147 | 18.162469 | 19.789138 | 17.318872 |
min | 0.000000 | 0.000000 | 10.000000 | 30.000000 | 40.000000 | 30.000000 | 50.000000 |
25% | 10.000000 | 10.000000 | 15.000000 | 30.000000 | 50.000000 | 40.000000 | 65.000000 |
50% | 15.000000 | 15.000000 | 20.000000 | 40.000000 | 60.000000 | 60.000000 | 85.000000 |
75% | 25.000000 | 30.000000 | 30.000000 | 80.000000 | 80.000000 | 65.000000 | 95.000000 |
max | 100.000000 | 70.000000 | 50.000000 | 100.000000 | 95.000000 | 90.000000 | 100.000000 |
#nombre d'observations
n = D.shape[0]
print(n)
#nombre de variables
p = D.shape[1]
print(p)
99 7
#vérification du package - présent par défaut avec Anaconda
import statsmodels
statsmodels.__version__
'0.13.5'
#classe de calcul
from statsmodels.multivariate.pca import PCA
#instanciation et calculs
# method = décomposition en valeurs singulières
acp = PCA(D,standardize=True,method="svd")
#valeurs propres
eig_val = acp.eigenvals/n
print(eig_val)
0 3.306082 1 2.719206 2 0.567371 3 0.193431 4 0.119954 5 0.076735 6 0.017222 Name: eigenvals, dtype: float64
#screeplot
import numpy, matplotlib.pyplot as plt
plt.plot(numpy.arange(1,eig_val.shape[0]+1,1),eig_val,marker="+")
[<matplotlib.lines.Line2D at 0x259f2e77040>]
#part de variance restituée
acp.rsquare
ncomp 0 0.000000 1 0.472297 2 0.860755 3 0.941808 4 0.969441 5 0.986578 6 0.997540 7 1.000000 Name: rsquare, dtype: float64
#loadings
acp.loadings
comp_0 | comp_1 | comp_2 | comp_3 | comp_4 | comp_5 | comp_6 | |
---|---|---|---|---|---|---|---|
cost | 0.277777 | -0.495104 | 0.196444 | -0.501708 | 0.053491 | -0.115932 | 0.609188 |
size | 0.124157 | -0.576560 | -0.019033 | 0.023074 | 0.345097 | 0.599013 | -0.416366 |
alcohol | 0.328726 | -0.458963 | -0.033555 | 0.422006 | -0.136822 | -0.638922 | -0.274110 |
reputat | -0.407584 | -0.075074 | 0.873209 | -0.016132 | 0.068973 | -0.140076 | -0.202747 |
color | 0.498336 | 0.113813 | 0.365705 | 0.173596 | -0.644671 | 0.398391 | 0.022730 |
aroma | 0.432358 | 0.311423 | 0.245264 | 0.407941 | 0.650799 | 0.005772 | 0.256787 |
taste | 0.444936 | 0.310209 | 0.059439 | -0.610620 | 0.124217 | -0.202859 | -0.522333 |
#correlations des var. avec les composantes
#loadings * racine carrée des val.p.
corr_fact = acp.loadings*pandas.Series(numpy.sqrt(eig_val.values),index=acp.loadings.columns)
print(corr_fact)
comp_0 comp_1 comp_2 comp_3 comp_4 comp_5 comp_6 cost 0.505072 -0.816427 0.147969 -0.220655 0.018526 -0.032114 0.079946 size 0.225750 -0.950749 -0.014336 0.010148 0.119522 0.165933 -0.054641 alcohol 0.597710 -0.756830 -0.025275 0.185602 -0.047387 -0.176988 -0.035972 reputat -0.741096 -0.123797 0.657736 -0.007095 0.023888 -0.038802 -0.026607 color 0.906105 0.187678 0.275464 0.076349 -0.223277 0.110358 0.002983 aroma 0.786140 0.513537 0.184743 0.179416 0.225400 0.001599 0.033699 taste 0.809010 0.511535 0.044772 -0.268556 0.043022 -0.056194 -0.068547
#récupérer les corrélations pour les 3 premières composantes
corr_3fact = corr_fact[['comp_0','comp_1','comp_2']]
#sous-forme de heatmap - 3 premiers facteurs
import seaborn as sns
sns.heatmap(corr_3fact,vmin=-1,vmax=1.0,center=0,cmap=sns.diverging_palette(20,250,as_cmap=True),linewidths=0.5,annot=True)
<Axes: >
#classe de calcul
from statsmodels.multivariate.factor_rotation import rotate_factors
#rotation varimax -> travail sur les contributions
L, T = rotate_factors(corr_3fact.values,method='varimax')
#corrélation après rotation
corrRotate = L
print(corrRotate)
[[ 0.11565405 -0.96445206 -0.00133856] [-0.23346253 -0.94882129 -0.01804583] [ 0.14321045 -0.93235262 -0.20223038] [-0.37252761 0.13644959 0.91638973] [ 0.92129565 -0.24721992 -0.14908654] [ 0.92731451 0.10617681 -0.21137868] [ 0.88640491 0.10926312 -0.34715758]]
#sous-forme de pandas dataframe
dfCorrRotate = pandas.DataFrame(corrRotate,index=corr_3fact.index,columns=corr_3fact.columns)
print(dfCorrRotate)
comp_0 comp_1 comp_2 cost 0.115654 -0.964452 -0.001339 size -0.233463 -0.948821 -0.018046 alcohol 0.143210 -0.932353 -0.202230 reputat -0.372528 0.136450 0.916390 color 0.921296 -0.247220 -0.149087 aroma 0.927315 0.106177 -0.211379 taste 0.886405 0.109263 -0.347158
#part de variance expliquée sur les 3 facteurs
var_restit = numpy.sum(dfCorrRotate.values**2,axis=0)
print(var_restit)
[2.72157822 2.80265916 1.06842083]
#la somme totale préservée
numpy.sum(var_restit)/p
0.9418083157346617
#heatmap again
sns.heatmap(dfCorrRotate,vmin=-1,vmax=1.0,center=0,cmap=sns.diverging_palette(20,250,as_cmap=True),linewidths=0.5,annot=True)
<Axes: >
#T étant la matrice de rotation
#on peut l'appliquer sur les variables
numpy.dot(corr_3fact.values,T)
array([[ 0.11565405, -0.96445206, -0.00133856], [-0.23346253, -0.94882129, -0.01804583], [ 0.14321045, -0.93235262, -0.20223038], [-0.37252761, 0.13644959, 0.91638973], [ 0.92129565, -0.24721992, -0.14908654], [ 0.92731451, 0.10617681, -0.21137868], [ 0.88640491, 0.10926312, -0.34715758]])
#on pourrait aussi l'appliquer sur les coordonnées factorielles
#des individus pour obtenir leurs positions dans le nouveau repère