#changement de dossier
import os
os.chdir("C:/Users/ricco/Desktop/demo")
#chargement des données
import pandas
df = pandas.read_excel("protein_dataset.xlsx",index_col = 0)
df
| RedMeat | WhiteMeat | Eggs | Milk | Fish | Cereals | Starch | Nuts | Fr_and_Veg | |
|---|---|---|---|---|---|---|---|---|---|
| Country | |||||||||
| Albania | 10.1 | 1.4 | 0.5 | 8.9 | 0.2 | 42.3 | 0.6 | 5.5 | 1.7 |
| Austria | 8.9 | 14.0 | 4.3 | 19.9 | 2.1 | 28.0 | 3.6 | 1.3 | 4.3 |
| Belgium | 13.5 | 9.3 | 4.1 | 17.5 | 4.5 | 26.6 | 5.7 | 2.1 | 4.0 |
| Bulgaria | 7.8 | 6.0 | 1.6 | 8.3 | 1.2 | 56.7 | 1.1 | 3.7 | 4.2 |
| Czechoslovakia | 9.7 | 11.4 | 2.8 | 12.5 | 2.0 | 34.3 | 5.0 | 1.1 | 4.0 |
| Denmark | 10.6 | 10.8 | 3.7 | 25.0 | 9.9 | 21.9 | 4.8 | 0.7 | 2.4 |
| E_Germany | 8.4 | 11.6 | 3.7 | 11.1 | 5.4 | 24.6 | 6.5 | 0.8 | 3.6 |
| Finland | 9.5 | 4.9 | 2.7 | 33.7 | 5.8 | 26.3 | 5.1 | 1.0 | 1.4 |
| France | 18.0 | 9.9 | 3.3 | 19.5 | 5.7 | 28.1 | 4.8 | 2.4 | 6.5 |
| Greece | 10.2 | 3.0 | 2.8 | 17.6 | 5.9 | 41.7 | 2.2 | 7.8 | 6.5 |
| Hungary | 5.3 | 12.4 | 2.9 | 9.7 | 0.3 | 40.1 | 4.0 | 5.4 | 4.2 |
| Ireland | 13.9 | 10.0 | 4.7 | 25.8 | 2.2 | 24.0 | 6.2 | 1.6 | 2.9 |
| Italy | 9.0 | 5.1 | 2.9 | 13.7 | 3.4 | 36.8 | 2.1 | 4.3 | 6.7 |
| Netherlands | 9.5 | 13.6 | 3.6 | 23.4 | 2.5 | 22.4 | 4.2 | 1.8 | 3.7 |
| Norway | 9.4 | 4.7 | 2.7 | 23.3 | 9.7 | 23.0 | 4.6 | 1.6 | 2.7 |
| Poland | 6.9 | 10.2 | 2.7 | 19.3 | 3.0 | 36.1 | 5.9 | 2.0 | 6.6 |
| Portugal | 6.2 | 3.7 | 1.1 | 4.9 | 14.2 | 27.0 | 5.9 | 4.7 | 7.9 |
| Romania | 6.2 | 6.3 | 1.5 | 11.1 | 1.0 | 49.6 | 3.1 | 5.3 | 2.8 |
| Spain | 7.1 | 3.4 | 3.1 | 8.6 | 7.0 | 29.2 | 5.7 | 5.9 | 7.2 |
| Sweden | 9.9 | 7.8 | 3.5 | 24.7 | 7.5 | 19.5 | 3.7 | 1.4 | 2.0 |
| Switzerland | 13.1 | 10.1 | 3.1 | 23.8 | 2.3 | 25.6 | 2.8 | 2.4 | 4.9 |
| UK | 17.4 | 5.7 | 4.7 | 20.6 | 4.3 | 24.3 | 4.7 | 3.4 | 3.3 |
| USSR | 9.3 | 4.6 | 2.1 | 16.6 | 3.0 | 43.6 | 6.4 | 3.4 | 2.9 |
| W_Germany | 11.4 | 12.5 | 4.1 | 18.8 | 3.4 | 18.6 | 5.2 | 1.5 | 3.8 |
| Yugoslavia | 4.4 | 5.0 | 1.2 | 9.5 | 0.6 | 55.9 | 3.0 | 5.7 | 3.2 |
#description
df.describe()
| RedMeat | WhiteMeat | Eggs | Milk | Fish | Cereals | Starch | Nuts | Fr_and_Veg | |
|---|---|---|---|---|---|---|---|---|---|
| count | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 |
| mean | 9.828000 | 7.896000 | 2.936000 | 17.112000 | 4.284000 | 32.248000 | 4.276000 | 3.072000 | 4.136000 |
| std | 3.347078 | 3.694081 | 1.117617 | 7.105416 | 3.402533 | 10.974786 | 1.634085 | 1.985682 | 1.803903 |
| min | 4.400000 | 1.400000 | 0.500000 | 4.900000 | 0.200000 | 18.600000 | 0.600000 | 0.700000 | 1.400000 |
| 25% | 7.800000 | 4.900000 | 2.700000 | 11.100000 | 2.100000 | 24.300000 | 3.100000 | 1.500000 | 2.900000 |
| 50% | 9.500000 | 7.800000 | 2.900000 | 17.600000 | 3.400000 | 28.000000 | 4.700000 | 2.400000 | 3.800000 |
| 75% | 10.600000 | 10.800000 | 3.700000 | 23.300000 | 5.800000 | 40.100000 | 5.700000 | 4.700000 | 4.900000 |
| max | 18.000000 | 14.000000 | 4.700000 | 33.700000 | 14.200000 | 56.700000 | 6.500000 | 7.800000 | 7.900000 |
#matrice de corrélation sous forme de heatmap
import seaborn as sns
sns.heatmap(df.corr(),vmin=-1,vmax=+1,center=0,cmap=sns.color_palette('vlag',as_cmap=True))
<Axes: >
#ACP avec scientistools
from scientisttools.decomposition import PCA
from scientisttools.extractfactor import get_eig
acp = PCA(normalize=True,graph=False)
acp.fit(df)
#valeurs propres
get_eig(acp)
| eigenvalue | difference | proportion | cumulative | |
|---|---|---|---|---|
| Dim.1 | 4.006438 | 2.371438 | 44.515973 | 44.515973 |
| Dim.2 | 1.634999 | 0.507080 | 18.166661 | 62.682634 |
| Dim.3 | 1.127920 | 0.173256 | 12.532439 | 75.215073 |
| Dim.4 | 0.954664 | 0.490826 | 10.607377 | 85.822450 |
| Dim.5 | 0.463838 | 0.138707 | 5.153760 | 90.976210 |
| Dim.6 | 0.325131 | 0.053525 | 3.612566 | 94.588776 |
| Dim.7 | 0.271606 | 0.155314 | 3.017848 | 97.606624 |
| Dim.8 | 0.116292 | 0.017180 | 1.292132 | 98.898757 |
| Dim.9 | 0.099112 | NaN | 1.101243 | 100.000000 |
#soit
acp.eig_[0]
array([4.00643757, 1.63499945, 1.1279195 , 0.95466396, 0.4638384 ,
0.32513097, 0.27160633, 0.1162919 , 0.0991119 ])
#screeplot
import matplotlib.pyplot as plt
import numpy
plt.plot(numpy.arange(1,10),acp.eig_[0],marker='.')
[<matplotlib.lines.Line2D at 0x28585456980>]
#screeplot avec la règle de Kaiser-Gutman
from scientisttools.pyplot import plot_eigenvalues
plot_eigenvalues(acp,choice='eigenvalue',add_kaiser=True)
#seuil de Karlis - Saporta - Spinaki
plot_eigenvalues(acp,choice="eigenvalue",add_kss=True)
#screeplot avec les seuils des bâtons brisés
plot_eigenvalues(acp,choice='eigenvalue',add_broken_stick=True)
#principe de la perturbation aléatoire
#valeurs intra-colonnes
from sklearn.utils import shuffle
#affichage des valeurs initialement
#on accède bien au vecteur numpy sous-jacent et non
#pas à la Série "pandas" qui tient compte des index
print(df.RedMeat.values,'\n')
#réordonnancement (perturbation) aléatoire des valeurs
print(shuffle(df.RedMeat.values))
[10.1 8.9 13.5 7.8 9.7 10.6 8.4 9.5 18. 10.2 5.3 13.9 9. 9.5 9.4 6.9 6.2 6.2 7.1 9.9 13.1 17.4 9.3 11.4 4.4] [ 9.4 7.8 6.9 18. 9.5 9.5 5.3 9. 13.1 10.6 10.1 4.4 9.7 7.1 13.5 17.4 8.9 9.3 10.2 13.9 11.4 6.2 9.9 8.4 6.2]
#traitement du data frame
#perturbation aléatoire colonne par colonne, indépendamment les unes des autres
dfRnd = df.apply(axis=0,func=lambda x:shuffle(x.values))
#affichage
dfRnd
| RedMeat | WhiteMeat | Eggs | Milk | Fish | Cereals | Starch | Nuts | Fr_and_Veg | |
|---|---|---|---|---|---|---|---|---|---|
| Country | |||||||||
| Albania | 11.4 | 6.0 | 4.7 | 8.3 | 3.0 | 40.1 | 4.2 | 5.5 | 3.8 |
| Austria | 6.9 | 1.4 | 3.7 | 18.8 | 0.6 | 26.6 | 2.2 | 0.7 | 4.3 |
| Belgium | 7.8 | 6.3 | 3.1 | 17.6 | 4.5 | 19.5 | 5.9 | 1.5 | 2.4 |
| Bulgaria | 9.7 | 9.3 | 1.5 | 25.8 | 2.0 | 18.6 | 3.1 | 1.3 | 4.0 |
| Czechoslovakia | 13.9 | 11.4 | 4.3 | 11.1 | 2.3 | 29.2 | 4.7 | 5.7 | 6.5 |
| Denmark | 9.5 | 11.6 | 1.6 | 19.3 | 5.7 | 22.4 | 6.2 | 1.4 | 3.6 |
| E_Germany | 18.0 | 4.9 | 1.2 | 16.6 | 0.2 | 24.0 | 6.5 | 1.1 | 7.9 |
| Finland | 9.9 | 10.1 | 2.1 | 19.9 | 1.2 | 26.3 | 3.0 | 0.8 | 2.7 |
| France | 8.9 | 10.2 | 3.6 | 8.9 | 3.0 | 25.6 | 2.8 | 5.3 | 1.4 |
| Greece | 7.1 | 12.4 | 2.9 | 9.7 | 5.9 | 43.6 | 6.4 | 2.0 | 3.7 |
| Hungary | 4.4 | 5.1 | 2.8 | 8.6 | 4.3 | 23.0 | 4.8 | 2.1 | 2.9 |
| Ireland | 10.1 | 4.6 | 2.9 | 23.3 | 3.4 | 36.8 | 4.6 | 2.4 | 6.5 |
| Italy | 10.2 | 5.0 | 3.3 | 12.5 | 2.2 | 28.0 | 5.1 | 4.7 | 3.3 |
| Netherlands | 9.5 | 3.4 | 4.1 | 4.9 | 14.2 | 49.6 | 1.1 | 7.8 | 4.0 |
| Norway | 9.0 | 12.5 | 4.7 | 11.1 | 5.8 | 36.1 | 5.7 | 3.7 | 2.8 |
| Poland | 6.2 | 3.0 | 3.5 | 9.5 | 5.4 | 27.0 | 3.6 | 1.6 | 6.6 |
| Portugal | 10.6 | 10.0 | 1.1 | 23.8 | 1.0 | 42.3 | 2.1 | 3.4 | 2.0 |
| Romania | 8.4 | 9.9 | 2.8 | 23.4 | 7.5 | 56.7 | 5.2 | 2.4 | 6.7 |
| Spain | 9.4 | 10.8 | 2.7 | 13.7 | 0.3 | 24.6 | 4.8 | 1.8 | 4.9 |
| Sweden | 9.3 | 3.7 | 2.7 | 17.5 | 2.5 | 24.3 | 4.0 | 5.9 | 3.2 |
| Switzerland | 5.3 | 5.7 | 4.1 | 25.0 | 3.4 | 34.3 | 5.7 | 1.0 | 2.9 |
| UK | 6.2 | 14.0 | 0.5 | 24.7 | 2.1 | 41.7 | 5.9 | 1.6 | 4.2 |
| USSR | 17.4 | 13.6 | 3.7 | 20.6 | 9.9 | 21.9 | 0.6 | 4.3 | 4.2 |
| W_Germany | 13.1 | 4.7 | 2.7 | 33.7 | 9.7 | 55.9 | 5.0 | 3.4 | 7.2 |
| Yugoslavia | 13.5 | 7.8 | 3.1 | 19.5 | 7.0 | 28.1 | 3.7 | 5.4 | 1.7 |
#description - les distributions individuelles sont conservées
dfRnd.describe()
| RedMeat | WhiteMeat | Eggs | Milk | Fish | Cereals | Starch | Nuts | Fr_and_Veg | |
|---|---|---|---|---|---|---|---|---|---|
| count | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 |
| mean | 9.828000 | 7.896000 | 2.936000 | 17.112000 | 4.284000 | 32.248000 | 4.276000 | 3.072000 | 4.136000 |
| std | 3.347078 | 3.694081 | 1.117617 | 7.105416 | 3.402533 | 10.974786 | 1.634085 | 1.985682 | 1.803903 |
| min | 4.400000 | 1.400000 | 0.500000 | 4.900000 | 0.200000 | 18.600000 | 0.600000 | 0.700000 | 1.400000 |
| 25% | 7.800000 | 4.900000 | 2.700000 | 11.100000 | 2.100000 | 24.300000 | 3.100000 | 1.500000 | 2.900000 |
| 50% | 9.500000 | 7.800000 | 2.900000 | 17.600000 | 3.400000 | 28.000000 | 4.700000 | 2.400000 | 3.800000 |
| 75% | 10.600000 | 10.800000 | 3.700000 | 23.300000 | 5.800000 | 40.100000 | 5.700000 | 4.700000 | 4.900000 |
| max | 18.000000 | 14.000000 | 4.700000 | 33.700000 | 14.200000 | 56.700000 | 6.500000 | 7.800000 | 7.900000 |
#heatmap des corrélations - mais les relations entre les variables non
sns.heatmap(dfRnd.corr(),vmin=-1,vmax=+1,center=0,cmap=sns.color_palette('vlag',as_cmap=True))
<Axes: >
#nombre de répétition
nrep = 200
#préparation - matrices des valeurs propres
mat_vp = numpy.zeros(shape=(nrep,9))
#itérer nboot fois
for i in range(nrep):
#créer une version H0 des données
dfAlea = df.apply(axis=0,func=lambda x:shuffle(x.values))
#lancer l'acp
pca = PCA(normalize=True,graph=False)
pca.fit(dfAlea)
#récupération des val.p
mat_vp[i,:] = pca.eig_[0]
#affichage des 5 premiers résultats
mat_vp[:5,:]
array([[2.29169811, 1.78887337, 1.41403348, 1.0176063 , 0.86197047,
0.56080423, 0.47998312, 0.34302169, 0.24200923],
[2.00492809, 1.66978688, 1.17539514, 1.16402752, 1.09900473,
0.69226665, 0.45470484, 0.40877623, 0.33110992],
[2.26842845, 1.65534782, 1.20590112, 1.0001701 , 0.87921409,
0.71737244, 0.54501328, 0.40968995, 0.31886276],
[2.30153973, 1.66069767, 1.30808683, 1.06596455, 0.88945257,
0.66984338, 0.49906255, 0.40709903, 0.19825369],
[2.25224159, 1.63364472, 1.19581738, 1.05669436, 0.92622841,
0.77182289, 0.60094526, 0.37532032, 0.18728508]])
#seuil = moyennes
moy_vp = numpy.mean(mat_vp,axis=0)
print(moy_vp)
[2.04828531 1.64488327 1.35706796 1.11713157 0.92025007 0.72333076 0.54517965 0.39791787 0.24595354]
#graphique
plt.plot(numpy.arange(1,10),acp.eig_[0],marker='.',label='Val.p - Observé')
plt.plot(numpy.arange(1,10),moy_vp,c='red',linestyle='--',label='Val.p - Moyenne RND')
plt.legend()
plt.show()
#principe du tirage avec remise
dfBoot = df.sample(frac=1,replace=True)
#affichage
dfBoot
| RedMeat | WhiteMeat | Eggs | Milk | Fish | Cereals | Starch | Nuts | Fr_and_Veg | |
|---|---|---|---|---|---|---|---|---|---|
| Country | |||||||||
| Czechoslovakia | 9.7 | 11.4 | 2.8 | 12.5 | 2.0 | 34.3 | 5.0 | 1.1 | 4.0 |
| Czechoslovakia | 9.7 | 11.4 | 2.8 | 12.5 | 2.0 | 34.3 | 5.0 | 1.1 | 4.0 |
| Romania | 6.2 | 6.3 | 1.5 | 11.1 | 1.0 | 49.6 | 3.1 | 5.3 | 2.8 |
| Spain | 7.1 | 3.4 | 3.1 | 8.6 | 7.0 | 29.2 | 5.7 | 5.9 | 7.2 |
| Denmark | 10.6 | 10.8 | 3.7 | 25.0 | 9.9 | 21.9 | 4.8 | 0.7 | 2.4 |
| Yugoslavia | 4.4 | 5.0 | 1.2 | 9.5 | 0.6 | 55.9 | 3.0 | 5.7 | 3.2 |
| Denmark | 10.6 | 10.8 | 3.7 | 25.0 | 9.9 | 21.9 | 4.8 | 0.7 | 2.4 |
| Austria | 8.9 | 14.0 | 4.3 | 19.9 | 2.1 | 28.0 | 3.6 | 1.3 | 4.3 |
| Netherlands | 9.5 | 13.6 | 3.6 | 23.4 | 2.5 | 22.4 | 4.2 | 1.8 | 3.7 |
| Spain | 7.1 | 3.4 | 3.1 | 8.6 | 7.0 | 29.2 | 5.7 | 5.9 | 7.2 |
| Netherlands | 9.5 | 13.6 | 3.6 | 23.4 | 2.5 | 22.4 | 4.2 | 1.8 | 3.7 |
| W_Germany | 11.4 | 12.5 | 4.1 | 18.8 | 3.4 | 18.6 | 5.2 | 1.5 | 3.8 |
| W_Germany | 11.4 | 12.5 | 4.1 | 18.8 | 3.4 | 18.6 | 5.2 | 1.5 | 3.8 |
| Poland | 6.9 | 10.2 | 2.7 | 19.3 | 3.0 | 36.1 | 5.9 | 2.0 | 6.6 |
| Sweden | 9.9 | 7.8 | 3.5 | 24.7 | 7.5 | 19.5 | 3.7 | 1.4 | 2.0 |
| UK | 17.4 | 5.7 | 4.7 | 20.6 | 4.3 | 24.3 | 4.7 | 3.4 | 3.3 |
| Finland | 9.5 | 4.9 | 2.7 | 33.7 | 5.8 | 26.3 | 5.1 | 1.0 | 1.4 |
| Yugoslavia | 4.4 | 5.0 | 1.2 | 9.5 | 0.6 | 55.9 | 3.0 | 5.7 | 3.2 |
| W_Germany | 11.4 | 12.5 | 4.1 | 18.8 | 3.4 | 18.6 | 5.2 | 1.5 | 3.8 |
| Albania | 10.1 | 1.4 | 0.5 | 8.9 | 0.2 | 42.3 | 0.6 | 5.5 | 1.7 |
| Austria | 8.9 | 14.0 | 4.3 | 19.9 | 2.1 | 28.0 | 3.6 | 1.3 | 4.3 |
| France | 18.0 | 9.9 | 3.3 | 19.5 | 5.7 | 28.1 | 4.8 | 2.4 | 6.5 |
| Belgium | 13.5 | 9.3 | 4.1 | 17.5 | 4.5 | 26.6 | 5.7 | 2.1 | 4.0 |
| Albania | 10.1 | 1.4 | 0.5 | 8.9 | 0.2 | 42.3 | 0.6 | 5.5 | 1.7 |
| Finland | 9.5 | 4.9 | 2.7 | 33.7 | 5.8 | 26.3 | 5.1 | 1.0 | 1.4 |
#nombre de répétition
nboot = 500
#préparation - matrices des valeurs propres
mat_boot = numpy.zeros(shape=(nboot,9))
#itérer nboot fois
for i in range(nboot):
#créer une version H0 des données
dfBoot = df.sample(frac=1,replace=True)
#lancer l'acp
pca = PCA(normalize=True,graph=False)
pca.fit(dfBoot)
#récupération des val.p
mat_boot[i,:] = pca.eig_[0]
#affichage des 5 premiers résultats
mat_boot[:5,:]
array([[4.04353039, 1.53328544, 1.23689395, 0.98737355, 0.40636245,
0.33231912, 0.251321 , 0.11617594, 0.09273816],
[3.98986081, 1.57348471, 1.38666231, 0.84899536, 0.53824047,
0.30683434, 0.2159775 , 0.07494269, 0.0650018 ],
[4.80264632, 1.53365625, 1.21119162, 0.55915084, 0.38926604,
0.2893321 , 0.09236361, 0.07035986, 0.05203337],
[3.67182943, 2.17421019, 1.26138634, 0.82150427, 0.43123922,
0.37154887, 0.15532703, 0.09784009, 0.01511456],
[3.96151845, 1.8516949 , 1.35802687, 0.65134699, 0.50655024,
0.25474757, 0.23734977, 0.12473749, 0.05402772]])
#quantiles d'ordre 5% et 95%
#bornes basses et hautes
bb_boot = numpy.quantile(mat_boot,q=0.05,axis=0)
bh_boot = numpy.quantile(mat_boot,q=0.95,axis=0)
#affichages
print(bb_boot,'\n')
print(bh_boot)
[3.58440735 1.39915547 0.89028728 0.55457512 0.31259333 0.18764714 0.09233864 0.03805161 0.01045625] [4.88977711 2.2379319 1.47530278 1.05533875 0.67821651 0.39396379 0.25822621 0.13523129 0.07764064]
#graphique, "intervalle de confiance" quantile
#observé
plt.plot(numpy.arange(1,10),acp.eig_[0],marker='.')
#bornes et intervalle
plt.plot(numpy.arange(1,10),bb_boot,marker='.')
plt.plot(numpy.arange(1,10),bh_boot,marker='.')
plt.fill_between(x=numpy.arange(1,10),y1=bb_boot,y2=bh_boot,alpha=0.5)
#seuil Kaiser-Gutman
plt.plot(numpy.arange(1,10),numpy.ones(9),linestyle='--',c='red')
plt.show()