Importation et caractéristiques des données¶

Caractéristiques initiales du jeu de données¶

In [ ]:
#changement de dossier
import os
os.chdir("C:/Users/ricco/Desktop/demo")

#chargement des données
import pandas
df = pandas.read_excel("protein_dataset.xlsx",index_col = 0)
df
Out[ ]:
RedMeat WhiteMeat Eggs Milk Fish Cereals Starch Nuts Fr_and_Veg
Country
Albania 10.1 1.4 0.5 8.9 0.2 42.3 0.6 5.5 1.7
Austria 8.9 14.0 4.3 19.9 2.1 28.0 3.6 1.3 4.3
Belgium 13.5 9.3 4.1 17.5 4.5 26.6 5.7 2.1 4.0
Bulgaria 7.8 6.0 1.6 8.3 1.2 56.7 1.1 3.7 4.2
Czechoslovakia 9.7 11.4 2.8 12.5 2.0 34.3 5.0 1.1 4.0
Denmark 10.6 10.8 3.7 25.0 9.9 21.9 4.8 0.7 2.4
E_Germany 8.4 11.6 3.7 11.1 5.4 24.6 6.5 0.8 3.6
Finland 9.5 4.9 2.7 33.7 5.8 26.3 5.1 1.0 1.4
France 18.0 9.9 3.3 19.5 5.7 28.1 4.8 2.4 6.5
Greece 10.2 3.0 2.8 17.6 5.9 41.7 2.2 7.8 6.5
Hungary 5.3 12.4 2.9 9.7 0.3 40.1 4.0 5.4 4.2
Ireland 13.9 10.0 4.7 25.8 2.2 24.0 6.2 1.6 2.9
Italy 9.0 5.1 2.9 13.7 3.4 36.8 2.1 4.3 6.7
Netherlands 9.5 13.6 3.6 23.4 2.5 22.4 4.2 1.8 3.7
Norway 9.4 4.7 2.7 23.3 9.7 23.0 4.6 1.6 2.7
Poland 6.9 10.2 2.7 19.3 3.0 36.1 5.9 2.0 6.6
Portugal 6.2 3.7 1.1 4.9 14.2 27.0 5.9 4.7 7.9
Romania 6.2 6.3 1.5 11.1 1.0 49.6 3.1 5.3 2.8
Spain 7.1 3.4 3.1 8.6 7.0 29.2 5.7 5.9 7.2
Sweden 9.9 7.8 3.5 24.7 7.5 19.5 3.7 1.4 2.0
Switzerland 13.1 10.1 3.1 23.8 2.3 25.6 2.8 2.4 4.9
UK 17.4 5.7 4.7 20.6 4.3 24.3 4.7 3.4 3.3
USSR 9.3 4.6 2.1 16.6 3.0 43.6 6.4 3.4 2.9
W_Germany 11.4 12.5 4.1 18.8 3.4 18.6 5.2 1.5 3.8
Yugoslavia 4.4 5.0 1.2 9.5 0.6 55.9 3.0 5.7 3.2
In [ ]:
#description
df.describe()
Out[ ]:
RedMeat WhiteMeat Eggs Milk Fish Cereals Starch Nuts Fr_and_Veg
count 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000
mean 9.828000 7.896000 2.936000 17.112000 4.284000 32.248000 4.276000 3.072000 4.136000
std 3.347078 3.694081 1.117617 7.105416 3.402533 10.974786 1.634085 1.985682 1.803903
min 4.400000 1.400000 0.500000 4.900000 0.200000 18.600000 0.600000 0.700000 1.400000
25% 7.800000 4.900000 2.700000 11.100000 2.100000 24.300000 3.100000 1.500000 2.900000
50% 9.500000 7.800000 2.900000 17.600000 3.400000 28.000000 4.700000 2.400000 3.800000
75% 10.600000 10.800000 3.700000 23.300000 5.800000 40.100000 5.700000 4.700000 4.900000
max 18.000000 14.000000 4.700000 33.700000 14.200000 56.700000 6.500000 7.800000 7.900000
In [ ]:
#matrice de corrélation sous forme de heatmap
import seaborn as sns
sns.heatmap(df.corr(),vmin=-1,vmax=+1,center=0,cmap=sns.color_palette('vlag',as_cmap=True))
Out[ ]:
<Axes: >

ACP avec "scientisttools"¶

ACP et éboulis des valeurs propres¶

In [ ]:
#ACP avec scientistools
from scientisttools.decomposition import PCA
from scientisttools.extractfactor import get_eig
acp = PCA(normalize=True,graph=False)
acp.fit(df)

#valeurs propres
get_eig(acp)
Out[ ]:
eigenvalue difference proportion cumulative
Dim.1 4.006438 2.371438 44.515973 44.515973
Dim.2 1.634999 0.507080 18.166661 62.682634
Dim.3 1.127920 0.173256 12.532439 75.215073
Dim.4 0.954664 0.490826 10.607377 85.822450
Dim.5 0.463838 0.138707 5.153760 90.976210
Dim.6 0.325131 0.053525 3.612566 94.588776
Dim.7 0.271606 0.155314 3.017848 97.606624
Dim.8 0.116292 0.017180 1.292132 98.898757
Dim.9 0.099112 NaN 1.101243 100.000000
In [ ]:
#soit
acp.eig_[0]
Out[ ]:
array([4.00643757, 1.63499945, 1.1279195 , 0.95466396, 0.4638384 ,
       0.32513097, 0.27160633, 0.1162919 , 0.0991119 ])
In [ ]:
#screeplot
import matplotlib.pyplot as plt
import numpy
plt.plot(numpy.arange(1,10),acp.eig_[0],marker='.')
Out[ ]:
[<matplotlib.lines.Line2D at 0x28585456980>]

Seuils pour la détection des facteurs¶

In [ ]:
#screeplot avec la règle de Kaiser-Gutman
from scientisttools.pyplot import plot_eigenvalues
plot_eigenvalues(acp,choice='eigenvalue',add_kaiser=True)
In [ ]:
#seuil de Karlis - Saporta - Spinaki
plot_eigenvalues(acp,choice="eigenvalue",add_kss=True)
In [ ]:
#screeplot avec les seuils des bâtons brisés
plot_eigenvalues(acp,choice='eigenvalue',add_broken_stick=True)

Analyse parallèle¶

Perturbations aléatoires pour l'analyse parallèle¶

In [ ]:
#principe de la perturbation aléatoire
#valeurs intra-colonnes
from sklearn.utils import shuffle

#affichage des valeurs initialement
#on accède bien au vecteur numpy sous-jacent et non
#pas à la Série "pandas" qui tient compte des index
print(df.RedMeat.values,'\n')

#réordonnancement (perturbation) aléatoire des valeurs
print(shuffle(df.RedMeat.values))
[10.1  8.9 13.5  7.8  9.7 10.6  8.4  9.5 18.  10.2  5.3 13.9  9.   9.5
  9.4  6.9  6.2  6.2  7.1  9.9 13.1 17.4  9.3 11.4  4.4] 

[ 9.4  7.8  6.9 18.   9.5  9.5  5.3  9.  13.1 10.6 10.1  4.4  9.7  7.1
 13.5 17.4  8.9  9.3 10.2 13.9 11.4  6.2  9.9  8.4  6.2]
In [ ]:
#traitement du data frame
#perturbation aléatoire colonne par colonne, indépendamment les unes des autres
dfRnd = df.apply(axis=0,func=lambda x:shuffle(x.values))

#affichage
dfRnd
Out[ ]:
RedMeat WhiteMeat Eggs Milk Fish Cereals Starch Nuts Fr_and_Veg
Country
Albania 11.4 6.0 4.7 8.3 3.0 40.1 4.2 5.5 3.8
Austria 6.9 1.4 3.7 18.8 0.6 26.6 2.2 0.7 4.3
Belgium 7.8 6.3 3.1 17.6 4.5 19.5 5.9 1.5 2.4
Bulgaria 9.7 9.3 1.5 25.8 2.0 18.6 3.1 1.3 4.0
Czechoslovakia 13.9 11.4 4.3 11.1 2.3 29.2 4.7 5.7 6.5
Denmark 9.5 11.6 1.6 19.3 5.7 22.4 6.2 1.4 3.6
E_Germany 18.0 4.9 1.2 16.6 0.2 24.0 6.5 1.1 7.9
Finland 9.9 10.1 2.1 19.9 1.2 26.3 3.0 0.8 2.7
France 8.9 10.2 3.6 8.9 3.0 25.6 2.8 5.3 1.4
Greece 7.1 12.4 2.9 9.7 5.9 43.6 6.4 2.0 3.7
Hungary 4.4 5.1 2.8 8.6 4.3 23.0 4.8 2.1 2.9
Ireland 10.1 4.6 2.9 23.3 3.4 36.8 4.6 2.4 6.5
Italy 10.2 5.0 3.3 12.5 2.2 28.0 5.1 4.7 3.3
Netherlands 9.5 3.4 4.1 4.9 14.2 49.6 1.1 7.8 4.0
Norway 9.0 12.5 4.7 11.1 5.8 36.1 5.7 3.7 2.8
Poland 6.2 3.0 3.5 9.5 5.4 27.0 3.6 1.6 6.6
Portugal 10.6 10.0 1.1 23.8 1.0 42.3 2.1 3.4 2.0
Romania 8.4 9.9 2.8 23.4 7.5 56.7 5.2 2.4 6.7
Spain 9.4 10.8 2.7 13.7 0.3 24.6 4.8 1.8 4.9
Sweden 9.3 3.7 2.7 17.5 2.5 24.3 4.0 5.9 3.2
Switzerland 5.3 5.7 4.1 25.0 3.4 34.3 5.7 1.0 2.9
UK 6.2 14.0 0.5 24.7 2.1 41.7 5.9 1.6 4.2
USSR 17.4 13.6 3.7 20.6 9.9 21.9 0.6 4.3 4.2
W_Germany 13.1 4.7 2.7 33.7 9.7 55.9 5.0 3.4 7.2
Yugoslavia 13.5 7.8 3.1 19.5 7.0 28.1 3.7 5.4 1.7
In [ ]:
#description - les distributions individuelles sont conservées
dfRnd.describe()
Out[ ]:
RedMeat WhiteMeat Eggs Milk Fish Cereals Starch Nuts Fr_and_Veg
count 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000
mean 9.828000 7.896000 2.936000 17.112000 4.284000 32.248000 4.276000 3.072000 4.136000
std 3.347078 3.694081 1.117617 7.105416 3.402533 10.974786 1.634085 1.985682 1.803903
min 4.400000 1.400000 0.500000 4.900000 0.200000 18.600000 0.600000 0.700000 1.400000
25% 7.800000 4.900000 2.700000 11.100000 2.100000 24.300000 3.100000 1.500000 2.900000
50% 9.500000 7.800000 2.900000 17.600000 3.400000 28.000000 4.700000 2.400000 3.800000
75% 10.600000 10.800000 3.700000 23.300000 5.800000 40.100000 5.700000 4.700000 4.900000
max 18.000000 14.000000 4.700000 33.700000 14.200000 56.700000 6.500000 7.800000 7.900000
In [ ]:
#heatmap des corrélations - mais les relations entre les variables non
sns.heatmap(dfRnd.corr(),vmin=-1,vmax=+1,center=0,cmap=sns.color_palette('vlag',as_cmap=True))
Out[ ]:
<Axes: >

Calculs des seuils avec l'analyse parallèle¶

In [ ]:
#nombre de répétition
nrep = 200

#préparation - matrices des valeurs propres
mat_vp = numpy.zeros(shape=(nrep,9))

#itérer nboot fois
for i in range(nrep):
    #créer une version H0 des données
    dfAlea = df.apply(axis=0,func=lambda x:shuffle(x.values))
    #lancer l'acp
    pca = PCA(normalize=True,graph=False)
    pca.fit(dfAlea)
    #récupération des val.p
    mat_vp[i,:] = pca.eig_[0]

#affichage des 5 premiers résultats
mat_vp[:5,:]
Out[ ]:
array([[2.29169811, 1.78887337, 1.41403348, 1.0176063 , 0.86197047,
        0.56080423, 0.47998312, 0.34302169, 0.24200923],
       [2.00492809, 1.66978688, 1.17539514, 1.16402752, 1.09900473,
        0.69226665, 0.45470484, 0.40877623, 0.33110992],
       [2.26842845, 1.65534782, 1.20590112, 1.0001701 , 0.87921409,
        0.71737244, 0.54501328, 0.40968995, 0.31886276],
       [2.30153973, 1.66069767, 1.30808683, 1.06596455, 0.88945257,
        0.66984338, 0.49906255, 0.40709903, 0.19825369],
       [2.25224159, 1.63364472, 1.19581738, 1.05669436, 0.92622841,
        0.77182289, 0.60094526, 0.37532032, 0.18728508]])
In [ ]:
#seuil = moyennes
moy_vp = numpy.mean(mat_vp,axis=0)
print(moy_vp)
[2.04828531 1.64488327 1.35706796 1.11713157 0.92025007 0.72333076
 0.54517965 0.39791787 0.24595354]
In [ ]:
#graphique
plt.plot(numpy.arange(1,10),acp.eig_[0],marker='.',label='Val.p - Observé')
plt.plot(numpy.arange(1,10),moy_vp,c='red',linestyle='--',label='Val.p - Moyenne RND')
plt.legend()
plt.show()

Analyse bootstrap¶

Principe du tirage avec remise¶

In [ ]:
#principe du tirage avec remise
dfBoot = df.sample(frac=1,replace=True)

#affichage
dfBoot
Out[ ]:
RedMeat WhiteMeat Eggs Milk Fish Cereals Starch Nuts Fr_and_Veg
Country
Czechoslovakia 9.7 11.4 2.8 12.5 2.0 34.3 5.0 1.1 4.0
Czechoslovakia 9.7 11.4 2.8 12.5 2.0 34.3 5.0 1.1 4.0
Romania 6.2 6.3 1.5 11.1 1.0 49.6 3.1 5.3 2.8
Spain 7.1 3.4 3.1 8.6 7.0 29.2 5.7 5.9 7.2
Denmark 10.6 10.8 3.7 25.0 9.9 21.9 4.8 0.7 2.4
Yugoslavia 4.4 5.0 1.2 9.5 0.6 55.9 3.0 5.7 3.2
Denmark 10.6 10.8 3.7 25.0 9.9 21.9 4.8 0.7 2.4
Austria 8.9 14.0 4.3 19.9 2.1 28.0 3.6 1.3 4.3
Netherlands 9.5 13.6 3.6 23.4 2.5 22.4 4.2 1.8 3.7
Spain 7.1 3.4 3.1 8.6 7.0 29.2 5.7 5.9 7.2
Netherlands 9.5 13.6 3.6 23.4 2.5 22.4 4.2 1.8 3.7
W_Germany 11.4 12.5 4.1 18.8 3.4 18.6 5.2 1.5 3.8
W_Germany 11.4 12.5 4.1 18.8 3.4 18.6 5.2 1.5 3.8
Poland 6.9 10.2 2.7 19.3 3.0 36.1 5.9 2.0 6.6
Sweden 9.9 7.8 3.5 24.7 7.5 19.5 3.7 1.4 2.0
UK 17.4 5.7 4.7 20.6 4.3 24.3 4.7 3.4 3.3
Finland 9.5 4.9 2.7 33.7 5.8 26.3 5.1 1.0 1.4
Yugoslavia 4.4 5.0 1.2 9.5 0.6 55.9 3.0 5.7 3.2
W_Germany 11.4 12.5 4.1 18.8 3.4 18.6 5.2 1.5 3.8
Albania 10.1 1.4 0.5 8.9 0.2 42.3 0.6 5.5 1.7
Austria 8.9 14.0 4.3 19.9 2.1 28.0 3.6 1.3 4.3
France 18.0 9.9 3.3 19.5 5.7 28.1 4.8 2.4 6.5
Belgium 13.5 9.3 4.1 17.5 4.5 26.6 5.7 2.1 4.0
Albania 10.1 1.4 0.5 8.9 0.2 42.3 0.6 5.5 1.7
Finland 9.5 4.9 2.7 33.7 5.8 26.3 5.1 1.0 1.4

Réalisation des calculs¶

In [ ]:
#nombre de répétition
nboot = 500

#préparation - matrices des valeurs propres
mat_boot = numpy.zeros(shape=(nboot,9))

#itérer nboot fois
for i in range(nboot):
    #créer une version H0 des données
    dfBoot = df.sample(frac=1,replace=True)
    #lancer l'acp
    pca = PCA(normalize=True,graph=False)
    pca.fit(dfBoot)
    #récupération des val.p
    mat_boot[i,:] = pca.eig_[0]

#affichage des 5 premiers résultats
mat_boot[:5,:]
Out[ ]:
array([[4.04353039, 1.53328544, 1.23689395, 0.98737355, 0.40636245,
        0.33231912, 0.251321  , 0.11617594, 0.09273816],
       [3.98986081, 1.57348471, 1.38666231, 0.84899536, 0.53824047,
        0.30683434, 0.2159775 , 0.07494269, 0.0650018 ],
       [4.80264632, 1.53365625, 1.21119162, 0.55915084, 0.38926604,
        0.2893321 , 0.09236361, 0.07035986, 0.05203337],
       [3.67182943, 2.17421019, 1.26138634, 0.82150427, 0.43123922,
        0.37154887, 0.15532703, 0.09784009, 0.01511456],
       [3.96151845, 1.8516949 , 1.35802687, 0.65134699, 0.50655024,
        0.25474757, 0.23734977, 0.12473749, 0.05402772]])
In [ ]:
#quantiles d'ordre 5% et 95%
#bornes basses et hautes
bb_boot = numpy.quantile(mat_boot,q=0.05,axis=0)
bh_boot = numpy.quantile(mat_boot,q=0.95,axis=0)

#affichages
print(bb_boot,'\n')
print(bh_boot)
[3.58440735 1.39915547 0.89028728 0.55457512 0.31259333 0.18764714
 0.09233864 0.03805161 0.01045625] 

[4.88977711 2.2379319  1.47530278 1.05533875 0.67821651 0.39396379
 0.25822621 0.13523129 0.07764064]
In [ ]:
#graphique, "intervalle de confiance" quantile
#observé
plt.plot(numpy.arange(1,10),acp.eig_[0],marker='.')
#bornes et intervalle
plt.plot(numpy.arange(1,10),bb_boot,marker='.')
plt.plot(numpy.arange(1,10),bh_boot,marker='.')
plt.fill_between(x=numpy.arange(1,10),y1=bb_boot,y2=bh_boot,alpha=0.5)
#seuil Kaiser-Gutman
plt.plot(numpy.arange(1,10),numpy.ones(9),linestyle='--',c='red')
plt.show()