Importation et inspection des données¶

In [ ]:
# Switch the working directory to the demo folder
import os
os.chdir(
    "C:/Users/ricco/Desktop/demo"
)

# Load the workbook - Decathlon dataset from the FactoMineR package;
# index_col=0 turns the first column of the sheet into the row index
import pandas
deca = pandas.read_excel(
    "decathlon_olympic.xlsx",
    index_col=0,
)
# Summary statistics of every column
deca.describe()
Out[ ]:
100m Long_jump Shot_put High_jump 400m 110m_hurdle Discus Pole_vault Javeline 1500m
count 28.000000 28.000000 28.000000 28.000000 28.000000 28.000000 28.000000 28.000000 28.000000 28.000000
mean 10.915714 7.265714 14.625000 1.976429 49.610000 14.553571 44.375714 4.732143 58.948929 277.550714
std 0.231027 0.341136 0.856221 0.089947 1.268455 0.442696 3.299586 0.289384 4.975849 11.321826
min 10.440000 6.610000 13.070000 1.850000 46.810000 13.970000 39.830000 4.200000 50.620000 263.080000
25% 10.837500 7.020000 13.977500 1.932500 48.930000 14.200000 42.012500 4.500000 55.360000 270.690000
50% 10.905000 7.280000 14.785000 1.940000 49.370000 14.400000 44.505000 4.700000 58.940000 276.320000
75% 11.080000 7.482500 15.170000 2.037500 50.360000 14.952500 45.732500 4.925000 61.000000 280.432500
max 11.360000 7.960000 16.360000 2.150000 53.200000 15.390000 51.650000 5.400000 70.520000 317.000000

ACP et rotation Quartimax¶

ACP - Choix du nombre de facteurs¶

In [ ]:
# PCA implementation from statsmodels
from statsmodels.multivariate.pca import PCA

# Fit the PCA on standardized variables;
# method="svd" selects the singular value decomposition
acp = PCA(
    deca,
    standardize=True,
    method="svd",
)

# Eigenvalues divided by the number of rows of deca
# (on this data the rescaled values add up to the number of variables)
eig_val = acp.eigenvals / len(deca)
print(eig_val)
0    3.544657
1    1.969956
2    1.421725
3    0.903491
4    0.563632
5    0.528227
6    0.432861
7    0.365810
8    0.163496
9    0.106145
Name: eigenvals, dtype: float64
In [ ]:
# Scree plot: eigenvalues drawn against their rank (1, 2, ...)
import numpy
import matplotlib.pyplot as plt
plt.plot(numpy.arange(len(eig_val)) + 1, eig_val, marker="+")
Out[ ]:
[<matplotlib.lines.Line2D at 0x1e91dd45100>]
In [ ]:
# Share of variance restored, indexed by the number of components kept
# (the displayed series grows from 0 with no component to 1 with all of them)
acp.rsquare
Out[ ]:
ncomp
0     0.000000
1     0.354466
2     0.551461
3     0.693634
4     0.783983
5     0.840346
6     0.893169
7     0.936455
8     0.973036
9     0.989386
10    1.000000
Name: rsquare, dtype: float64
In [ ]:
# Correlations between the variables and the components:
# every loadings column is scaled by the square root of its eigenvalue
all_corr_fact = acp.loadings.mul(numpy.sqrt(eig_val.values), axis="columns")
print(all_corr_fact)
               comp_0    comp_1    comp_2    comp_3    comp_4    comp_5  \
100m         0.795838 -0.253599  0.252770 -0.071298  0.201126 -0.041150   
Long_jump   -0.793506  0.324979 -0.155214  0.005841 -0.133206  0.005747   
Shot_put    -0.628969 -0.622801 -0.022525 -0.133494 -0.033523  0.159002   
High_jump   -0.625992 -0.471948  0.012916  0.105516  0.502654  0.253755   
400m         0.734180 -0.494657 -0.229726 -0.111158  0.178706 -0.164643   
110m_hurdle  0.708927 -0.232408  0.043929 -0.109666 -0.315807  0.548085   
Discus      -0.542104 -0.667282 -0.017855  0.196003 -0.257758 -0.059894   
Pole_vault  -0.179599  0.326306 -0.624477 -0.611345  0.129133  0.143375   
Javeline    -0.286421 -0.338982  0.521087 -0.655676 -0.122550 -0.218434   
1500m        0.210744 -0.473357 -0.785201  0.054466 -0.147813 -0.193563   

               comp_6    comp_7    comp_8    comp_9  
100m         0.034477  0.385210 -0.203958 -0.005757  
Long_jump    0.421351  0.083450 -0.145646  0.130848  
Shot_put    -0.336113 -0.111026 -0.183328  0.113474  
High_jump    0.213941 -0.044270  0.039265 -0.088171  
400m         0.147774 -0.125267  0.113691  0.204125  
110m_hurdle  0.165011 -0.040961  0.016679 -0.011743  
Discus      -0.015241  0.346662  0.177344  0.015395  
Pole_vault  -0.109775  0.205390  0.078020  0.000165  
Javeline     0.156172 -0.092801  0.012682 -0.075610  
1500m        0.098260 -0.088493 -0.120958 -0.143461  
In [ ]:
# Keep only the correlations with the first 4 components
corr_fact = all_corr_fact.loc[:, ['comp_0','comp_1','comp_2','comp_3']]
corr_fact
Out[ ]:
comp_0 comp_1 comp_2 comp_3
100m 0.795838 -0.253599 0.252770 -0.071298
Long_jump -0.793506 0.324979 -0.155214 0.005841
Shot_put -0.628969 -0.622801 -0.022525 -0.133494
High_jump -0.625992 -0.471948 0.012916 0.105516
400m 0.734180 -0.494657 -0.229726 -0.111158
110m_hurdle 0.708927 -0.232408 0.043929 -0.109666
Discus -0.542104 -0.667282 -0.017855 0.196003
Pole_vault -0.179599 0.326306 -0.624477 -0.611345
Javeline -0.286421 -0.338982 0.521087 -0.655676
1500m 0.210744 -0.473357 -0.785201 0.054466

Rotation Quartimax¶

In [ ]:
# Factor rotation routine from statsmodels
from statsmodels.multivariate.factor_rotation import rotate_factors

# Quartimax rotation of the variable/component correlations
# (L is the first value returned by rotate_factors, T the second one)
L, T = rotate_factors(corr_fact.values, method='quartimax')

# Correlations after rotation, wrapped in a pandas DataFrame that
# reuses the row and column labels of corr_fact
dfCorrRotate = pandas.DataFrame(
    data=L,
    index=corr_fact.index,
    columns=corr_fact.columns,
)
print(dfCorrRotate)
               comp_0    comp_1    comp_2    comp_3
100m         0.788156  0.270765  0.235868 -0.128491
Long_jump   -0.821676 -0.189475 -0.218514  0.024234
Shot_put    -0.156277 -0.843979 -0.077689 -0.243005
High_jump   -0.273136 -0.733029  0.089119 -0.077610
400m         0.904342 -0.035956 -0.113485  0.129728
110m_hurdle  0.723381  0.210079  0.048835 -0.027350
Discus      -0.108667 -0.860408  0.160527 -0.000456
Pole_vault  -0.199193  0.119546 -0.921007 -0.014929
Javeline     0.013137 -0.292546 -0.043879 -0.900393
1500m        0.471927 -0.407095 -0.451166  0.544054
In [ ]:
# Squared cosines (cos2): element-wise square of the rotated correlations
dfCos2 = dfCorrRotate.pow(2)
dfCos2
Out[ ]:
comp_0 comp_1 comp_2 comp_3
100m 0.621190 0.073314 0.055634 1.650987e-02
Long_jump 0.675152 0.035901 0.047748 5.872730e-04
Shot_put 0.024423 0.712301 0.006036 5.905163e-02
High_jump 0.074604 0.537332 0.007942 6.023261e-03
400m 0.817834 0.001293 0.012879 1.682935e-02
110m_hurdle 0.523281 0.044133 0.002385 7.480050e-04
Discus 0.011808 0.740301 0.025769 2.079460e-07
Pole_vault 0.039678 0.014291 0.848254 2.228843e-04
Javeline 0.000173 0.085583 0.001925 8.107073e-01
1500m 0.222715 0.165727 0.203551 2.959946e-01
In [ ]:
# Component each variable is attached to: the column holding its largest cos2
idComp = dfCos2.idxmax(axis="columns")
idComp
Out[ ]:
100m           comp_0
Long_jump      comp_0
Shot_put       comp_1
High_jump      comp_1
400m           comp_0
110m_hurdle    comp_0
Discus         comp_1
Pole_vault     comp_2
Javeline       comp_3
1500m          comp_3
dtype: object
In [ ]:
# Order the variables by the label of the component
# they are attached to (ascending)
idSorted = idComp.sort_values(ascending=True)
idSorted
Out[ ]:
100m           comp_0
Long_jump      comp_0
400m           comp_0
110m_hurdle    comp_0
Shot_put       comp_1
High_jump      comp_1
Discus         comp_1
Pole_vault     comp_2
Javeline       comp_3
1500m          comp_3
dtype: object
In [ ]:
# Variable names ordered block by block: components are visited in the
# order they appear in idSorted and, inside each block, the attached
# variables are sorted by ascending correlation with that component
lstvar = [
    var
    for comp in idSorted.unique()
    for var in dfCorrRotate.loc[idSorted == comp, comp].sort_values().index
]

# Show the resulting order
print(lstvar)
['Long_jump', '110m_hurdle', '100m', '400m', 'Discus', 'Shot_put', 'High_jump', 'Pole_vault', 'Javeline', '1500m']
In [ ]:
# Reorder the rows of the variable x component correlation table
# so that they follow the block order stored in lstvar
dfSortedRotate = dfCorrRotate.loc[lstvar, :]
dfSortedRotate
Out[ ]:
comp_0 comp_1 comp_2 comp_3
Long_jump -0.821676 -0.189475 -0.218514 0.024234
110m_hurdle 0.723381 0.210079 0.048835 -0.027350
100m 0.788156 0.270765 0.235868 -0.128491
400m 0.904342 -0.035956 -0.113485 0.129728
Discus -0.108667 -0.860408 0.160527 -0.000456
Shot_put -0.156277 -0.843979 -0.077689 -0.243005
High_jump -0.273136 -0.733029 0.089119 -0.077610
Pole_vault -0.199193 0.119546 -0.921007 -0.014929
Javeline 0.013137 -0.292546 -0.043879 -0.900393
1500m 0.471927 -0.407095 -0.451166 0.544054
In [ ]:
# Annotated heatmap of the reordered variable/component correlations,
# colour scale fixed to [-1, 1] and centred on 0
import seaborn as sns
sns.heatmap(
    dfSortedRotate,
    vmin=-1,
    vmax=1.0,
    center=0,
    cmap=sns.diverging_palette(20, 250, as_cmap=True),
    annot=True,
    linewidths=0.1,
)
Out[ ]:
<Axes: >
In [ ]:
# Alternative view: pairwise correlations between the variables, taken in
# the order defined by lstvar so that the "blocks" are easier to spot
sns.heatmap(
    deca.loc[:, lstvar].corr(),
    vmin=-1,
    vmax=1.0,
    center=0,
    cmap=sns.diverging_palette(120, 350, as_cmap=True),
    linewidths=0.1,
)
Out[ ]:
<Axes: >

Clustering de variables¶

In [ ]:
# Distance matrix between variables: d(j, k) = sqrt(1 - r(j, k)^2),
# where r is the pairwise correlation computed by deca.corr()
corr_var = deca.corr()
# Clamp at 0 so that rounding noise in the correlations can never give
# a negative value under the square root (which would turn into NaN)
sq_dist = numpy.clip(1.0 - corr_var.to_numpy() ** 2, 0.0, None)
# A variable is at distance 0 from itself: force it exactly, because
# scipy's squareform() rejects a matrix whose diagonal is not strictly zero
numpy.fill_diagonal(sq_dist, 0.0)
dist_var = pandas.DataFrame(
    numpy.sqrt(sq_dist),
    index=corr_var.index,
    columns=corr_var.columns,
)
print(dist_var)
                 100m  Long_jump  Shot_put  High_jump      400m  110m_hurdle  \
100m         0.000000   0.709237  0.929153   0.950970  0.772689     0.840018   
Long_jump    0.709237   0.000000  0.980713   0.938359  0.741357     0.842841   
Shot_put     0.929153   0.980713  0.000000   0.790407  0.979938     0.969505   
High_jump    0.950970   0.938359  0.790407   0.000000  0.985589     0.945384   
400m         0.772689   0.741357  0.979938   0.985589  0.000000     0.854239   
110m_hurdle  0.840018   0.842841  0.969505   0.945384  0.854239     0.000000   
Discus       0.972401   0.968269  0.746153   0.856008  0.989551     0.976200   
Pole_vault   0.965487   0.958504  0.999719   0.999101  0.993316     0.988538   
Javeline     0.999932   0.995592  0.923606   0.978870  0.998505     0.996809   
1500m        0.998291   0.989073  0.991574   0.999994  0.834357     0.983841   

               Discus  Pole_vault  Javeline     1500m  
100m         0.972401    0.965487  0.999932  0.998291  
Long_jump    0.968269    0.958504  0.995592  0.989073  
Shot_put     0.746153    0.999719  0.923606  0.991574  
High_jump    0.856008    0.999101  0.978870  0.999994  
400m         0.989551    0.993316  0.998505  0.834357  
110m_hurdle  0.976200    0.988538  0.996809  0.983841  
Discus       0.000000    0.982887  0.966980  0.975461  
Pole_vault   0.982887    0.000000  0.997813  0.983763  
Javeline     0.966980    0.997813  0.000000  0.967856  
1500m        0.975461    0.983763  0.967856  0.000000  
In [ ]:
# Preparation for scipy's hierarchical clustering:
# flatten the square distance matrix into its vector form
from scipy.spatial.distance import squareform
VD = squareform(dist_var.to_numpy())
print(VD)
[0.70923746 0.9291529  0.95096979 0.77268866 0.84001839 0.97240075
 0.96548678 0.99993167 0.99829125 0.98071342 0.93835906 0.74135674
 0.8428406  0.96826927 0.95850391 0.99559181 0.98907308 0.79040681
 0.97993848 0.96950542 0.74615315 0.9997193  0.92360602 0.9915737
 0.98558877 0.94538436 0.85600757 0.99910148 0.97887048 0.99999389
 0.85423903 0.98955056 0.99331582 0.99850536 0.8343575  0.97620001
 0.98853761 0.99680932 0.98384082 0.98288663 0.96697968 0.97546116
 0.99781269 0.98376318 0.96785583]
In [ ]:
# Agglomerative hierarchical clustering (CAH) with Ward linkage,
# run on the vectorized distances stored in VD
# NOTE(review): scipy documents Ward linkage as correctly defined for
# Euclidean distances; VD holds sqrt(1 - r^2) values - confirm this is intended
from scipy.cluster.hierarchy import ward
cah = ward(VD)
print(cah)
[[ 0.          1.          0.70923746  2.        ]
 [ 2.          6.          0.74615315  2.        ]
 [ 4.         10.          0.77250618  3.        ]
 [ 3.         11.          0.84818222  3.        ]
 [ 5.         12.          0.89326471  4.        ]
 [ 8.          9.          0.96785583  2.        ]
 [ 7.         15.          0.99834787  3.        ]
 [13.         16.          1.12994439  6.        ]
 [14.         17.          1.26176007 10.        ]]
In [ ]:
# Dendrogram of the variable clustering, leaves labelled with the column names
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
plt.title("CAH")
# Pass the labels as a plain list rather than a pandas Index: an Index has
# no unambiguous truth value, so a list is the form that works regardless of
# how the installed SciPy release validates `labels`
# color_threshold=0 is kept from the original call
dendrogram(
    cah,
    labels=deca.columns.tolist(),
    orientation='left',
    color_threshold=0,
)
plt.show()