# switch to the working directory that holds the data file
import os
os.chdir("C:/Users/ricco/Desktop/demo")

# load the dataset; first column of the sheet = row labels (car models)
import pandas
D = pandas.read_excel("autos-acp-partial.xlsx", index_col=0)
print(D)
CYL PUISS LONG LARG POIDS VMAX Modele Alfasud TI 1350 79 393 161 870 165 Audi 100 1588 85 468 177 1110 160 Simca 1300 1294 68 424 168 1050 152 Citroen GS Club 1222 59 412 161 930 151 Fiat 132 1585 98 439 164 1105 165 Lancia Beta 1297 82 429 169 1080 160 Peugeot 504 1796 79 449 169 1160 154 Renault 16 TL 1565 55 424 163 1010 140 Renault 30 2664 128 452 173 1320 180 Toyota Corolla 1166 55 399 157 815 140 Alfetta 1.66 1570 109 428 162 1060 175 Princess 1800 1798 82 445 172 1160 158 Datsun 200L 1998 115 469 169 1370 160 Taunus 2000 1993 98 438 170 1080 167 Rancho 1442 80 431 166 1129 144 Mazda 9295 1769 83 440 165 1095 165 Opel Rekord 1979 100 459 173 1120 173 Lada 1300 1294 68 404 161 955 140
# pairwise (cross) correlations between the variables, as a numpy matrix
R = D.corr().to_numpy()
print(R)
[[1. 0.79662771 0.70146192 0.62975716 0.78895203 0.66493402] [0.79662771 1. 0.64136235 0.52083197 0.765293 0.84437948] [0.70146192 0.64136235 1. 0.84926635 0.86809028 0.47592847] [0.62975716 0.52083197 0.84926635 1. 0.71687392 0.47294527] [0.78895203 0.765293 0.86809028 0.71687392 1. 0.4775956 ] [0.66493402 0.84437948 0.47592847 0.47294527 0.4775956 1. ]]
# heatmap of the raw correlation matrix
# (a heatmap of the squared correlations is sometimes more informative)
import seaborn as sns
palette = sns.diverging_palette(10, 250, as_cmap=True)
sns.heatmap(
    R,
    vmin=-1, vmax=+1, center=0,
    cmap=palette,
    linewidths=0.1,
    annot=True,
    xticklabels=D.columns,
    yticklabels=D.columns,
)
<Axes: >
# install the library if needed
#!pip install fanalysis

# PCA computation class
from fanalysis.pca import PCA

# instantiate: standardized PCA, keep row/column labels for the plots
acp = PCA(std_unit=True, row_labels=D.index, col_labels=D.columns)

# fit on the raw data matrix
acp.fit(D.values)

# eigenvalue table (value, percentage, cumulative percentage per row)
acp.eig_
array([[4.42085806e+00, 8.56062289e-01, 3.73066077e-01, 2.13922089e-01, 9.28012120e-02, 4.32902727e-02], [7.36809677e+01, 1.42677048e+01, 6.21776796e+00, 3.56536815e+00, 1.54668687e+00, 7.21504545e-01], [7.36809677e+01, 8.79486725e+01, 9.41664404e+01, 9.77318086e+01, 9.92784955e+01, 1.00000000e+02]])
# scree plot of the eigenvalues
acp.plot_eigenvalues()

# correlation circle on factors 1 and 2
acp.correlation_circle(num_x_axis=1, num_y_axis=2)

# map of the individuals
# beware: the spread along the x-axis is about 5 times
# larger than along the y-axis
acp.mapping_row(num_x_axis=1, num_y_axis=2)
Carte des individus plus réaliste, pour ne pas être trompé par la mise à l'échelle automatique du graphique.
# factor coordinates of the individuals in the first plane
coord = acp.row_topandas()[['row_coord_dim1', 'row_coord_dim2']]

# build the plotting axes
import matplotlib.pyplot as plt
axe = plt.gca()

# identical range on both axes so the scaling does not mislead the eye
axe.axis([-5, +5, -5, +5])
axe.plot([-5, +5], [0, 0], color='silver', linestyle='--')
axe.plot([0, 0], [-5, +5], color='silver', linestyle='--')
axe.set_xlabel("Dim.1")
axe.set_ylabel("Dim.2")
plt.title("Carte des individus")

# one centred text label per vehicle
for modele, (xx, yy) in coord.iterrows():
    axe.text(xx, yy, modele, fontsize=7,
             horizontalalignment="center", verticalalignment="center")
plt.show()
Corrélations partielles par paires de variables
# dataset dimensions: n observations, p variables
n, p = D.shape
print(n)
print(p)
18 6
# invert the correlation matrix (needed for the partial correlations)
import numpy
invCorr = numpy.linalg.inv(R)
print(invCorr)
[[ 3.77201352 -0.69347038 0.31247993 -0.43395472 -1.96161216 -0.92921187] [-0.69347038 11.1188198 0.74480091 2.28233732 -6.86128033 -7.08436656] [ 0.31247993 0.74480091 7.20419514 -3.19842205 -4.48616669 -0.61010269] [-0.43395472 2.28233732 -3.19842205 4.19760475 -0.82030424 -1.70985103] [-1.96161216 -6.86128033 -4.48616669 -0.82030424 9.95728271 4.86536606] [-0.92921187 -7.08436656 -0.61010269 -1.70985103 4.86536606 6.37511213]]
# outer product of the diagonal of the inverse correlation matrix:
# prodDiag[i, j] = invCorr[i, i] * invCorr[j, j]
# numpy.outer is the idiomatic, shape-safe form of the original
# dot(diag.reshape(p, 1), diag.reshape(1, p)) construction, and it no
# longer depends on p being consistent with the matrix size
d = numpy.diag(invCorr)
prodDiag = numpy.outer(d, d)

# partial correlation: rho_ij = -invCorr_ij / sqrt(invCorr_ii * invCorr_jj)
RHO = -1.0 * invCorr / numpy.sqrt(prodDiag)

# the diagonal of a correlation matrix is 1 by definition
numpy.fill_diagonal(RHO, 1.0)

# display
print(RHO)
[[ 1. 0.10708089 -0.05994359 0.10905787 0.32007821 0.18948909] [ 0.10708089 1. -0.08321816 -0.33407941 0.65208679 0.84144893] [-0.05994359 -0.08321816 1. 0.58162393 0.52967839 0.09002566] [ 0.10905787 -0.33407941 0.58162393 1. 0.1268831 0.33053206] [ 0.32007821 0.65208679 0.52967839 0.1268831 1. -0.61066241] [ 0.18948909 0.84144893 0.09002566 0.33053206 -0.61066241 1. ]]
# heatmap of the partial correlation matrix
plt.title("Matrice des corrélations partielles")
sns.heatmap(RHO,
            vmin=-1, vmax=+1, center=0,
            cmap=sns.diverging_palette(10, 250, as_cmap=True),
            linewidths=0.1,
            annot=True,
            xticklabels=D.columns,
            yticklabels=D.columns)
plt.show()
# KMO index: raw vs. partial correlations
# measures the redundancy of the variables in the data
# (subtracting p removes the diagonal 1's from each sum of squares)
sumR2 = numpy.sum(R**2) - p
sumRHO2 = numpy.sum(RHO**2) - p
KMO = sumR2 / (sumR2 + sumRHO2)
print(KMO)
0.7400088239713852
# per-variable KMO (subtracting 1 removes each variable's own diagonal term)
KMO_j = (numpy.sum(R**2, axis=0) - 1) / (
    (numpy.sum(R**2, axis=0) - 1) + (numpy.sum(RHO**2, axis=0) - 1))

# wrap in a pandas structure and sort in decreasing order
dfKMO_j = (pandas.DataFrame(KMO_j, index=D.columns, columns=['kmo'])
           .sort_values(by='kmo', ascending=False))
print(dfKMO_j)
kmo CYL 0.939956 LONG 0.803384 LARG 0.783650 POIDS 0.693091 PUISS 0.674346 VMAX 0.597664
# graphical display of the per-variable KMO values
sns.barplot(x='kmo', y=dfKMO_j.index, data=dfKMO_j,
            orient='h', color='skyblue')
<Axes: xlabel='kmo'>
Objectif : effectuer une analyse "à cylindrée égale" (c.-à-d. contrôler l'effet de la cylindrée dans les corrélations.)
# install the pingouin package (only needed once)
# website: https://pingouin-stats.org/build/html/index.html
#!pip install pingouin
# check the installed version
import pingouin as pg
pg.__version__
'0.5.3'
# pairwise correlations between all variables,
# controlling for the effect of engine displacement ("CYL")
Pcorr = pg.pairwise_corr(D, columns=D.columns, covar="CYL")
print(Pcorr)
X Y method covar alternative n r CI95% \ 0 PUISS LONG pearson ['CYL'] two-sided 18 0.191635 [-0.32, 0.62] 1 PUISS LARG pearson ['CYL'] two-sided 18 0.040784 [-0.45, 0.51] 2 PUISS POIDS pearson ['CYL'] two-sided 18 0.368295 [-0.14, 0.72] 3 PUISS VMAX pearson ['CYL'] two-sided 18 0.696984 [0.33, 0.88] 4 LONG LARG pearson ['CYL'] two-sided 18 0.736086 [0.4, 0.9] 5 LONG POIDS pearson ['CYL'] two-sided 18 0.718547 [0.36, 0.89] 6 LONG VMAX pearson ['CYL'] two-sided 18 0.017851 [-0.47, 0.49] 7 LARG POIDS pearson ['CYL'] two-sided 18 0.460976 [-0.03, 0.77] 8 LARG VMAX pearson ['CYL'] two-sided 18 0.093415 [-0.41, 0.55] 9 POIDS VMAX pearson ['CYL'] two-sided 18 -0.102422 [-0.56, 0.4] p-unc 0 0.461233 1 0.876497 2 0.145790 3 0.001875 4 0.000755 5 0.001156 6 0.945785 7 0.062554 8 0.721388 9 0.695677
c:\Users\ricco\anaconda3\lib\site-packages\pingouin\pairwise.py:1429: FutureWarning: In a future version of pandas all arguments of DataFrame.any and Series.any will be keyword-only. stats = stats[~stats[["X", "Y"]].isin(covar).any(1)]
# the controlled correlations come back as a flat vector
# (upper triangular part only, pair by pair)
Pcorr["r"]
0 0.191635 1 0.040784 2 0.368295 3 0.696984 4 0.736086 5 0.718547 6 0.017851 7 0.460976 8 0.093415 9 -0.102422 Name: r, dtype: float64
# expand the condensed vector into a square symmetric matrix
from scipy.spatial.distance import squareform
MPcorr = squareform(Pcorr.r)
print(MPcorr)
[[ 0. 0.19163511 0.04078385 0.36829487 0.69698446] [ 0.19163511 0. 0.736086 0.71854683 0.01785118] [ 0.04078385 0.736086 0. 0.46097645 0.09341514] [ 0.36829487 0.71854683 0.46097645 0. -0.1024223 ] [ 0.69698446 0.01785118 0.09341514 -0.1024223 0. ]]
# squareform leaves the diagonal at 0; a correlation matrix has 1's there
numpy.fill_diagonal(MPcorr, 1)
print(MPcorr)
[[ 1. 0.19163511 0.04078385 0.36829487 0.69698446] [ 0.19163511 1. 0.736086 0.71854683 0.01785118] [ 0.04078385 0.736086 1. 0.46097645 0.09341514] [ 0.36829487 0.71854683 0.46097645 1. -0.1024223 ] [ 0.69698446 0.01785118 0.09341514 -0.1024223 1. ]]
# new heatmap: cross correlations controlled for displacement
# (CYL itself is excluded, hence D.columns[1:])
plt.title("Corrélations conditionnellement à CYLINDREE")
sns.heatmap(MPcorr,
            vmin=-1, vmax=+1, center=0,
            cmap=sns.diverging_palette(10, 250, as_cmap=True),
            linewidths=0.1,
            annot=True,
            xticklabels=D.columns[1:],
            yticklabels=D.columns[1:])
plt.show()
# PCA from the matrix of controlled correlations,
# via direct diagonalization of that matrix
valp, vecp = numpy.linalg.eig(MPcorr)

# numpy.linalg.eig returns the eigenvalues in NO guaranteed order, while
# the code below treats the first two columns of vecp as the two leading
# components -- sort explicitly in decreasing eigenvalue order
# (here the two largest already sit in columns 0 and 1, so downstream
# results are unchanged; the sort makes that assumption safe)
order = numpy.argsort(valp)[::-1]
valp = valp[order]
vecp = vecp[:, order]

# display the eigenvalues
print(valp)
[2.40105829 1.61317102 0.69868647 0.10359165 0.18349256]
# the eigenvectors (one column per factor)
print(vecp)
[[ 0.31326136 0.62974607 0.34205162 0.595848 0.18235421] [ 0.58263803 -0.21039099 -0.08188603 0.19446152 -0.75614016] [ 0.501658 -0.21362183 -0.6150808 0.14670274 0.55032615] [ 0.53469139 -0.14608189 0.58308243 -0.53827801 0.25107142] [ 0.15762171 0.70155635 -0.39747435 -0.54395309 -0.17059678]]
# loadings: correlations of the variables with the first two components
# (eigenvector columns rescaled by the square root of their eigenvalues)
corrComp = numpy.sqrt(valp[:2]) * vecp[:, :2]
print(corrComp)
[[ 0.4854094 0.7998447 ] [ 0.90281794 -0.26721901] [ 0.77733656 -0.27132252] [ 0.82852296 -0.18553959] [ 0.24424034 0.89105142]]
# correlation circle for the PCA on the controlled correlations
fig, axe = plt.subplots(figsize=(5, 5))

# unit square with dashed axes through the origin
axe.axis([-1, +1, -1, +1])
axe.plot([-1, +1], [0, 0], color='silver', linestyle='--')
axe.plot([0, 0], [-1, +1], color='silver', linestyle='--')
axe.set_xlabel("Dim.1")
axe.set_ylabel("Dim.2")
plt.title("Cercle des corrélations")

# one arrow + label per variable (CYL excluded: it is the control variable)
for nom, cx, cy in zip(D.columns[1:], corrComp[:, 0], corrComp[:, 1]):
    plt.arrow(0, 0, cx * 0.9, cy * 0.9, head_width=0.03)
    axe.text(cx, cy, nom, fontsize=7, color='blue',
             horizontalalignment="center", verticalalignment="center")

# the unit circle itself
axe.add_artist(plt.Circle((0, 0), 1, fill=False, color='grey'))
plt.show()
# factor rotation, to sharpen the variable/factor correspondence
from statsmodels.multivariate.factor_rotation import rotate_factors

# varimax rotation: makes each variable load mainly on one factor
L, T = rotate_factors(corrComp, method='varimax')

# loadings after rotation
corrRotate = L
print(corrRotate)
[[ 0.18578012 0.91698396] [ 0.93996158 0.05439183] [ 0.8232883 0.00802631] [ 0.84239124 0.10607648] [-0.07202636 0.92110704]]
# new correlation circle, after the varimax rotation
fig, axe = plt.subplots(figsize=(5, 5))

# unit square with dashed axes through the origin
axe.axis([-1, +1, -1, +1])
axe.plot([-1, +1], [0, 0], color='silver', linestyle='--')
axe.plot([0, 0], [-1, +1], color='silver', linestyle='--')
axe.set_xlabel("Dim.1")
axe.set_ylabel("Dim.2")
plt.title("Cercle des corrélations après rotation varimax")

# one arrow + label per variable (rotated loadings this time)
for nom, cx, cy in zip(D.columns[1:], corrRotate[:, 0], corrRotate[:, 1]):
    plt.arrow(0, 0, cx * 0.9, cy * 0.9, head_width=0.03)
    axe.text(cx, cy, nom, fontsize=7, color='blue',
             horizontalalignment="center", verticalalignment="center")

# the unit circle itself
axe.add_artist(plt.Circle((0, 0), 1, fill=False, color='grey'))
plt.show()
# scores of the variables relative to displacement:
# i.e. the residual of each variable's simple regression on CYL
from numpy import polyval, polyfit

def _residu_cyl(y):
    # residual of the linear fit y ~ CYL (degree-1 polynomial)
    return y - polyval(polyfit(D.CYL, y, deg=1), D.CYL)

scores = D[D.columns[1:]].apply(_residu_cyl, axis=0)
print(scores)
PUISS LONG LARG POIDS VMAX Modele Alfasud TI 6.616050 -28.818832 -3.146005 -127.441146 12.802960 Audi 100 2.284461 36.310927 10.724111 43.784864 2.664916 Simca 1300 -1.952987 4.503578 4.355145 68.740969 1.011911 Citroen GS Club -7.827464 -4.510467 -2.000521 -30.453454 1.566277 Fiat 132 15.414691 7.435341 -2.249042 39.651763 7.729682 Lancia Beta 11.916782 9.379163 5.328297 97.874070 8.947146 Peugeot 504 -12.744828 8.684833 0.862699 33.679864 -7.825474 Renault 16 TL -26.717108 -6.735226 -3.070060 -49.568910 -16.838550 Renault 30 -1.424743 -24.312518 -2.905116 -57.142923 -0.564220 Toyota Corolla -9.396502 -15.188057 -5.499371 -129.271339 -8.224772 Alfetta 1.66 27.065841 -2.942584 -4.114806 -1.013742 18.053508 Princess 1800 -9.831648 4.601890 3.844800 33.101931 -3.868651 Datsun 200L 14.486344 20.307569 -0.945019 185.308663 -6.186334 Taunus 2000 -2.296606 -10.485073 0.099727 -103.246506 0.921608 Rancho 3.622327 5.365781 1.030678 104.973950 -10.183175 Mazda 9295 -7.572757 0.804567 -2.895676 -23.518045 3.757413 Opel Rekord 0.311135 11.095530 3.225014 -59.200977 7.223845 Lada 1300 -1.952987 -15.496422 -2.644855 -26.259031 -10.988089
# mean of the scores per variable
# (expected to be ~0 up to floating-point error: they are regression residuals)
scores.mean()
PUISS 8.684411e-15 LONG 1.010550e-13 LARG 1.894781e-14 POIDS 2.589534e-13 VMAX -6.315935e-15 dtype: float64
# standard deviation of the scores per variable
scores.std()
PUISS 12.316855 LONG 15.756068 LARG 4.127632 POIDS 84.154382 VMAX 9.067677 dtype: float64
# standardisation before projecting onto the factors;
# no re-centring needed: as regression residuals the scores are
# already centred, hence with_mean=False
from sklearn.preprocessing import StandardScaler

# instantiation: divide by the standard deviation only
std = StandardScaler(with_mean=False, with_std=True)

# standardized values
Z = std.fit_transform(scores)
print(Z)
[[ 0.55272709 -1.88208975 -0.78427835 -1.55827724 1.45286785] [ 0.19085152 2.37138074 2.6734504 0.5353762 0.30241222] [-0.16315913 0.29411802 1.08570898 0.84052514 0.11483071] [-0.65393269 -0.29456791 -0.49871665 -0.37236737 0.17773964] [ 1.28779509 0.48558457 -0.56067147 0.48483901 0.87715702] [ 0.99556807 0.61253096 1.32830956 1.19674798 1.0153137 ] [-1.06474576 0.56718592 0.21506511 0.41181806 -0.88802748] [-2.23203705 -0.43986171 -0.76534589 -0.6061002 -1.91082286] [-0.11902783 -1.58779305 -0.72422643 -0.69871088 -0.06402716] [-0.78501538 -0.99189608 -1.37095717 -1.58065578 -0.93333938] [ 2.26117137 -0.19217323 -1.02579409 -0.01239546 2.04869514] [-0.82136891 0.30053856 0.95848353 0.40475143 -0.4390109 ] [ 1.21023787 1.32623933 -0.23558695 2.26584803 -0.70201943] [-0.19186617 -0.68475529 0.02486124 -1.26243905 0.10458317] [ 0.30262134 0.35042645 0.25694138 1.28356124 -1.15557711] [-0.63265355 0.05254434 -0.72187298 -0.28756516 0.4263877 ] [ 0.02599324 0.72462282 0.80397488 -0.72387559 0.81975521] [-0.16315913 -1.01203468 -0.65934511 -0.32108037 -1.24691804]]
# factor coordinates of the individuals:
# standardized data projected onto the two leading eigenvectors
coordFact = Z @ vecp[:, :2]
print(coordFact)
[[-1.92106244 2.15849617] [ 3.11652858 -0.81588609] [ 1.13242886 -0.43878504] [-0.79774951 -0.06420984] [ 0.80256939 1.37314182] [ 2.1350403 0.75180472] [ 0.18503267 -1.51895334] [-1.96469676 -2.40158866] [-1.70939618 0.47096097] [-2.50386059 -0.41669294] [ 0.39806347 3.12261315] [ 0.54585244 -1.1523552 ] [ 2.1345301 -0.2900653 ] [-1.1051276 0.27571894] [ 0.93203315 -0.93624777] [-0.61625559 0.08588495] [ 0.5758171 0.37301803] [-1.33974736 -0.57685457]]
# change-of-basis matrix produced by the varimax rotation
matTrans = T

# apply the same rotation to the individuals' coordinates
coordRotate = coordFact @ matTrans
print(coordRotate)
[[-2.53864696 1.38016601] [ 3.20865509 0.2880157 ] [ 1.21411349 -0.02925506] [-0.72883885 -0.33063746] [ 0.28999624 1.56382169] [ 1.75416214 1.43056723] [ 0.68861261 -1.36648052] [-1.03505373 -2.9251197 ] [-1.76787115 -0.13590862] [-2.21469199 -1.24019767] [-0.68319885 3.07284994] [ 0.90392309 -0.89933333] [ 2.10659713 0.45011683] [-1.13319046 -0.1149232 ] [ 1.19407131 -0.56518969] [-0.60891629 -0.127938 ] [ 0.41542315 0.54601409] [-1.06514597 -0.99656824]]
# Ellipse patches, to highlight specific vehicles on the map
from matplotlib.patches import Ellipse

# projection of the individuals in the first (rotated) factorial plane
fig, axe = plt.subplots(figsize=(7, 7))

# square frame with dashed axes through the origin
axe.axis([-3.5, +3.5, -3.5, +3.5])
axe.plot([-3.5, +3.5], [0, 0], color='silver', linestyle='--')
axe.plot([0, 0], [-3.5, +3.5], color='silver', linestyle='--')
axe.set_xlabel("Dim.1")
axe.set_ylabel("Dim.2")
plt.title("Carte des individus conditionnellement à CYL")

# one centred text label per vehicle
for modele, cx, cy in zip(D.index, coordRotate[:, 0], coordRotate[:, 1]):
    axe.text(cx, cy, modele, fontsize=7,
             horizontalalignment="center", verticalalignment="center")

# circle the two vehicles of interest:
# Renault 16 (row 7, orange) and Renault 30 (row 8, violet)
for idx, couleur in ((7, 'darkorange'), (8, 'darkviolet')):
    axe.add_artist(Ellipse((coordRotate[idx, 0], coordRotate[idx, 1]),
                           1, 0.5, fill=False, color=couleur))
plt.show()
Comment faire si on souhaite projeter des individus supplémentaires ?...