# Change the working directory to the folder that holds the data file.
# NOTE(review): this absolute path is machine-specific; adjust it before running elsewhere.
import os
os.chdir("C:/Users/ricco/Desktop/demo")
# Load the data file - Decathlon dataset from the FactoMineR package.
# The first spreadsheet column is used as the row index.
import pandas
deca = pandas.read_excel("decathlon_olympic.xlsx",index_col=0)
# Summary statistics for every column.
deca.describe()
| | 100m | Long_jump | Shot_put | High_jump | 400m | 110m_hurdle | Discus | Pole_vault | Javeline | 1500m |
---|---|---|---|---|---|---|---|---|---|---|
count | 28.000000 | 28.000000 | 28.000000 | 28.000000 | 28.000000 | 28.000000 | 28.000000 | 28.000000 | 28.000000 | 28.000000 |
mean | 10.915714 | 7.265714 | 14.625000 | 1.976429 | 49.610000 | 14.553571 | 44.375714 | 4.732143 | 58.948929 | 277.550714 |
std | 0.231027 | 0.341136 | 0.856221 | 0.089947 | 1.268455 | 0.442696 | 3.299586 | 0.289384 | 4.975849 | 11.321826 |
min | 10.440000 | 6.610000 | 13.070000 | 1.850000 | 46.810000 | 13.970000 | 39.830000 | 4.200000 | 50.620000 | 263.080000 |
25% | 10.837500 | 7.020000 | 13.977500 | 1.932500 | 48.930000 | 14.200000 | 42.012500 | 4.500000 | 55.360000 | 270.690000 |
50% | 10.905000 | 7.280000 | 14.785000 | 1.940000 | 49.370000 | 14.400000 | 44.505000 | 4.700000 | 58.940000 | 276.320000 |
75% | 11.080000 | 7.482500 | 15.170000 | 2.037500 | 50.360000 | 14.952500 | 45.732500 | 4.925000 | 61.000000 | 280.432500 |
max | 11.360000 | 7.960000 | 16.360000 | 2.150000 | 53.200000 | 15.390000 | 51.650000 | 5.400000 | 70.520000 | 317.000000 |
# PCA class from statsmodels.
from statsmodels.multivariate.pca import PCA

# Fit the PCA on standardized variables; the decomposition method is SVD.
acp = PCA(deca, standardize=True, method="svd")

# Eigenvalues, rescaled by the number of rows of the data set.
eig_val = acp.eigenvals / len(deca.index)
print(eig_val)
0 3.544657 1 1.969956 2 1.421725 3 0.903491 4 0.563632 5 0.528227 6 0.432861 7 0.365810 8 0.163496 9 0.106145 Name: eigenvals, dtype: float64
# Scree plot: eigenvalues against the 1-based component rank.
import numpy
import matplotlib.pyplot as plt

plt.plot(numpy.arange(1, len(eig_val) + 1), eig_val, marker="+")
[<matplotlib.lines.Line2D at 0x1e91dd45100>]
# Share of variance reproduced, indexed by the number of components kept.
acp.rsquare
ncomp 0 0.000000 1 0.354466 2 0.551461 3 0.693634 4 0.783983 5 0.840346 6 0.893169 7 0.936455 8 0.973036 9 0.989386 10 1.000000 Name: rsquare, dtype: float64
# Correlations between the variables and the components:
# each loadings column is scaled by the square root of its eigenvalue.
all_corr_fact = acp.loadings.mul(
    pandas.Series(numpy.sqrt(eig_val.values), index=acp.loadings.columns),
    axis="columns",
)
print(all_corr_fact)
comp_0 comp_1 comp_2 comp_3 comp_4 comp_5 \ 100m 0.795838 -0.253599 0.252770 -0.071298 0.201126 -0.041150 Long_jump -0.793506 0.324979 -0.155214 0.005841 -0.133206 0.005747 Shot_put -0.628969 -0.622801 -0.022525 -0.133494 -0.033523 0.159002 High_jump -0.625992 -0.471948 0.012916 0.105516 0.502654 0.253755 400m 0.734180 -0.494657 -0.229726 -0.111158 0.178706 -0.164643 110m_hurdle 0.708927 -0.232408 0.043929 -0.109666 -0.315807 0.548085 Discus -0.542104 -0.667282 -0.017855 0.196003 -0.257758 -0.059894 Pole_vault -0.179599 0.326306 -0.624477 -0.611345 0.129133 0.143375 Javeline -0.286421 -0.338982 0.521087 -0.655676 -0.122550 -0.218434 1500m 0.210744 -0.473357 -0.785201 0.054466 -0.147813 -0.193563 comp_6 comp_7 comp_8 comp_9 100m 0.034477 0.385210 -0.203958 -0.005757 Long_jump 0.421351 0.083450 -0.145646 0.130848 Shot_put -0.336113 -0.111026 -0.183328 0.113474 High_jump 0.213941 -0.044270 0.039265 -0.088171 400m 0.147774 -0.125267 0.113691 0.204125 110m_hurdle 0.165011 -0.040961 0.016679 -0.011743 Discus -0.015241 0.346662 0.177344 0.015395 Pole_vault -0.109775 0.205390 0.078020 0.000165 Javeline 0.156172 -0.092801 0.012682 -0.075610 1500m 0.098260 -0.088493 -0.120958 -0.143461
# Keep only the correlations with the first 4 components.
corr_fact = all_corr_fact.loc[:, ['comp_0', 'comp_1', 'comp_2', 'comp_3']]
corr_fact
| | comp_0 | comp_1 | comp_2 | comp_3 |
---|---|---|---|---|
100m | 0.795838 | -0.253599 | 0.252770 | -0.071298 |
Long_jump | -0.793506 | 0.324979 | -0.155214 | 0.005841 |
Shot_put | -0.628969 | -0.622801 | -0.022525 | -0.133494 |
High_jump | -0.625992 | -0.471948 | 0.012916 | 0.105516 |
400m | 0.734180 | -0.494657 | -0.229726 | -0.111158 |
110m_hurdle | 0.708927 | -0.232408 | 0.043929 | -0.109666 |
Discus | -0.542104 | -0.667282 | -0.017855 | 0.196003 |
Pole_vault | -0.179599 | 0.326306 | -0.624477 | -0.611345 |
Javeline | -0.286421 | -0.338982 | 0.521087 | -0.655676 |
1500m | 0.210744 | -0.473357 | -0.785201 | 0.054466 |
# Factor rotation routine from statsmodels.
from statsmodels.multivariate.factor_rotation import rotate_factors
# Rotate the retained variable/component correlations.
# NOTE(review): the original comment announced a varimax rotation, but the
# call below requests method='quartimax' - confirm which one is intended.
L, T = rotate_factors(corr_fact.values,method='quartimax')
# Rotated correlations as a pandas DataFrame, reusing the labels of corr_fact.
dfCorrRotate = pandas.DataFrame(L,index=corr_fact.index,columns=corr_fact.columns)
print(dfCorrRotate)
comp_0 comp_1 comp_2 comp_3 100m 0.788156 0.270765 0.235868 -0.128491 Long_jump -0.821676 -0.189475 -0.218514 0.024234 Shot_put -0.156277 -0.843979 -0.077689 -0.243005 High_jump -0.273136 -0.733029 0.089119 -0.077610 400m 0.904342 -0.035956 -0.113485 0.129728 110m_hurdle 0.723381 0.210079 0.048835 -0.027350 Discus -0.108667 -0.860408 0.160527 -0.000456 Pole_vault -0.199193 0.119546 -0.921007 -0.014929 Javeline 0.013137 -0.292546 -0.043879 -0.900393 1500m 0.471927 -0.407095 -0.451166 0.544054
# Squared cosines: element-wise square of the rotated correlations.
dfCos2 = dfCorrRotate.pow(2)
dfCos2
| | comp_0 | comp_1 | comp_2 | comp_3 |
---|---|---|---|---|
100m | 0.621190 | 0.073314 | 0.055634 | 1.650987e-02 |
Long_jump | 0.675152 | 0.035901 | 0.047748 | 5.872730e-04 |
Shot_put | 0.024423 | 0.712301 | 0.006036 | 5.905163e-02 |
High_jump | 0.074604 | 0.537332 | 0.007942 | 6.023261e-03 |
400m | 0.817834 | 0.001293 | 0.012879 | 1.682935e-02 |
110m_hurdle | 0.523281 | 0.044133 | 0.002385 | 7.480050e-04 |
Discus | 0.011808 | 0.740301 | 0.025769 | 2.079460e-07 |
Pole_vault | 0.039678 | 0.014291 | 0.848254 | 2.228843e-04 |
Javeline | 0.000173 | 0.085583 | 0.001925 | 8.107073e-01 |
1500m | 0.222715 | 0.165727 | 0.203551 | 2.959946e-01 |
# Component each variable is attached to: the column holding its largest cos2.
idComp = dfCos2.idxmax(axis="columns")
idComp
100m comp_0 Long_jump comp_0 Shot_put comp_1 High_jump comp_1 400m comp_0 110m_hurdle comp_0 Discus comp_1 Pole_vault comp_2 Javeline comp_3 1500m comp_3 dtype: object
# Sort the variables by the label of the component
# they are attached to.
idSorted = idComp.sort_values()
idSorted
100m comp_0 Long_jump comp_0 400m comp_0 110m_hurdle comp_0 Shot_put comp_1 High_jump comp_1 Discus comp_1 Pole_vault comp_2 Javeline comp_3 1500m comp_3 dtype: object
# Variable names collected block by block, in component order.
lstvar = []
# Within each attachment component, order the variables by their signed
# rotated correlation with that component (ascending).
for comp in idSorted.unique():
    # Rows attached to this component, restricted to the component's own column.
    vcorr = dfCorrRotate.loc[idSorted == comp, comp]
    svcorr = vcorr.sort_values()
    lstvar.extend(svcorr.index)
# Display the resulting order.
print(lstvar)
['Long_jump', '110m_hurdle', '100m', '400m', 'Discus', 'Shot_put', 'High_jump', 'Pole_vault', 'Javeline', '1500m']
# Reorder the rows of the variable x component correlation table
# so that they follow the block order computed in lstvar.
dfSortedRotate = dfCorrRotate.loc[lstvar, :]
dfSortedRotate
| | comp_0 | comp_1 | comp_2 | comp_3 |
---|---|---|---|---|
Long_jump | -0.821676 | -0.189475 | -0.218514 | 0.024234 |
110m_hurdle | 0.723381 | 0.210079 | 0.048835 | -0.027350 |
100m | 0.788156 | 0.270765 | 0.235868 | -0.128491 |
400m | 0.904342 | -0.035956 | -0.113485 | 0.129728 |
Discus | -0.108667 | -0.860408 | 0.160527 | -0.000456 |
Shot_put | -0.156277 | -0.843979 | -0.077689 | -0.243005 |
High_jump | -0.273136 | -0.733029 | 0.089119 | -0.077610 |
Pole_vault | -0.199193 | 0.119546 | -0.921007 | -0.014929 |
Javeline | 0.013137 | -0.292546 | -0.043879 | -0.900393 |
1500m | 0.471927 | -0.407095 | -0.451166 | 0.544054 |
# Annotated heatmap of the reordered variable x component correlations,
# on a diverging colour scale centred on 0 and bounded to [-1, 1].
import seaborn as sns

sns.heatmap(
    dfSortedRotate,
    vmin=-1,
    vmax=1.0,
    center=0,
    cmap=sns.diverging_palette(20, 250, as_cmap=True),
    annot=True,
    linewidths=0.1,
)
<Axes: >
# Alternative view: pairwise correlations between the variables, taken in
# the order stored in lstvar so that the "blocks" are easier to spot.
sns.heatmap(
    deca[lstvar].corr(),
    vmin=-1,
    vmax=1.0,
    center=0,
    cmap=sns.diverging_palette(120, 350, as_cmap=True),
    linewidths=0.1,
)
<Axes: >
# Distance matrix between variables: sqrt(1 - r^2), r being the pairwise correlation.
dist_var = numpy.sqrt(1 - deca.corr().pow(2))
print(dist_var)
100m Long_jump Shot_put High_jump 400m 110m_hurdle \ 100m 0.000000 0.709237 0.929153 0.950970 0.772689 0.840018 Long_jump 0.709237 0.000000 0.980713 0.938359 0.741357 0.842841 Shot_put 0.929153 0.980713 0.000000 0.790407 0.979938 0.969505 High_jump 0.950970 0.938359 0.790407 0.000000 0.985589 0.945384 400m 0.772689 0.741357 0.979938 0.985589 0.000000 0.854239 110m_hurdle 0.840018 0.842841 0.969505 0.945384 0.854239 0.000000 Discus 0.972401 0.968269 0.746153 0.856008 0.989551 0.976200 Pole_vault 0.965487 0.958504 0.999719 0.999101 0.993316 0.988538 Javeline 0.999932 0.995592 0.923606 0.978870 0.998505 0.996809 1500m 0.998291 0.989073 0.991574 0.999994 0.834357 0.983841 Discus Pole_vault Javeline 1500m 100m 0.972401 0.965487 0.999932 0.998291 Long_jump 0.968269 0.958504 0.995592 0.989073 Shot_put 0.746153 0.999719 0.923606 0.991574 High_jump 0.856008 0.999101 0.978870 0.999994 400m 0.989551 0.993316 0.998505 0.834357 110m_hurdle 0.976200 0.988538 0.996809 0.983841 Discus 0.000000 0.982887 0.966980 0.975461 Pole_vault 0.982887 0.000000 0.997813 0.983763 Javeline 0.966980 0.997813 0.000000 0.967856 1500m 0.975461 0.983763 0.967856 0.000000
# Preparation for SciPy's hierarchical clustering:
# turn the square distance matrix into its vector form.
from scipy.spatial.distance import squareform
VD = squareform(dist_var)
print(VD)
[0.70923746 0.9291529 0.95096979 0.77268866 0.84001839 0.97240075 0.96548678 0.99993167 0.99829125 0.98071342 0.93835906 0.74135674 0.8428406 0.96826927 0.95850391 0.99559181 0.98907308 0.79040681 0.97993848 0.96950542 0.74615315 0.9997193 0.92360602 0.9915737 0.98558877 0.94538436 0.85600757 0.99910148 0.97887048 0.99999389 0.85423903 0.98955056 0.99331582 0.99850536 0.8343575 0.97620001 0.98853761 0.99680932 0.98384082 0.98288663 0.96697968 0.97546116 0.99781269 0.98376318 0.96785583]
# Hierarchical agglomerative clustering (CAH) - Ward linkage,
# computed from the vector-form distances.
from scipy.cluster.hierarchy import ward
cah = ward(VD)
print(cah)
[[ 0. 1. 0.70923746 2. ] [ 2. 6. 0.74615315 2. ] [ 4. 10. 0.77250618 3. ] [ 3. 11. 0.84818222 3. ] [ 5. 12. 0.89326471 4. ] [ 8. 9. 0.96785583 2. ] [ 7. 15. 0.99834787 3. ] [13. 16. 1.12994439 6. ] [14. 17. 1.26176007 10. ]]
# Dendrogram of the hierarchical clustering of the variables.
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
plt.title("CAH")
# Labels are passed as a plain list rather than a pandas Index: an Index has
# an ambiguous truth value, which can raise if the plotting routine
# truth-tests `labels`; a list is always safe.
# color_threshold=0 keeps every link in a single colour.
dendrogram(cah, labels=list(deca.columns), orientation='left', color_threshold=0)
plt.show()