Importation et inspection des données¶

In [ ]:
#working directory - defaults to the original demo folder; set the
#DEMO_DATA_DIR environment variable to run the notebook from another location
import os
DATA_DIR = os.environ.get("DEMO_DATA_DIR", "C:/Users/ricco/Desktop/demo")
os.chdir(DATA_DIR)

#data table - traffic on my pages in Nov 2017
#cf. tutorial => http://tutoriels-data-science.blogspot.com/p/tutoriels-en-francais.html#5640945788359221641
#the first spreadsheet column is used as the row index
import pandas
D = pandas.read_excel('FrequentationNov2017.xlsx',index_col=0)
D
Out[ ]:
DataScience Econometrie RegLogistique Excel Python Prog_R TanagraFR Tutoriels MasterSISE
Pays
Algerie 144 51 24 30 57 170 48 69 30
Belgique 37 15 20 9 32 58 19 10 5
CoteDIvoire 62 16 8 14 26 48 4 16 20
France 1561 440 508 519 950 1609 336 614 316
Maroc 244 92 74 54 67 140 61 78 37
Senegal 33 26 14 9 10 55 13 9 16
Tunisia 183 102 33 51 32 160 69 52 15
Canada 45 10 27 13 25 43 29 23 3
In [ ]:
#active rows - every country except Canada
is_active = D.index != 'Canada'
DAct = D.loc[is_active]
DAct
Out[ ]:
DataScience Econometrie RegLogistique Excel Python Prog_R TanagraFR Tutoriels MasterSISE
Pays
Algerie 144 51 24 30 57 170 48 69 30
Belgique 37 15 20 9 32 58 19 10 5
CoteDIvoire 62 16 8 14 26 48 4 16 20
France 1561 440 508 519 950 1609 336 614 316
Maroc 244 92 74 54 67 140 61 78 37
Senegal 33 26 14 9 10 55 13 9 16
Tunisia 183 102 33 51 32 160 69 52 15
In [ ]:
#row profiles - each count divided by the total of its row
row_totals = DAct.sum(axis=1)
profil_ligne = DAct.div(row_totals,axis='index')
profil_ligne
Out[ ]:
DataScience Econometrie RegLogistique Excel Python Prog_R TanagraFR Tutoriels MasterSISE
Pays
Algerie 0.231140 0.081862 0.038523 0.048154 0.091493 0.272873 0.077047 0.110754 0.048154
Belgique 0.180488 0.073171 0.097561 0.043902 0.156098 0.282927 0.092683 0.048780 0.024390
CoteDIvoire 0.289720 0.074766 0.037383 0.065421 0.121495 0.224299 0.018692 0.074766 0.093458
France 0.227783 0.064205 0.074128 0.075733 0.138625 0.234788 0.049030 0.089596 0.046111
Maroc 0.288076 0.108619 0.087367 0.063754 0.079103 0.165289 0.072019 0.092090 0.043684
Senegal 0.178378 0.140541 0.075676 0.048649 0.054054 0.297297 0.070270 0.048649 0.086486
Tunisia 0.262554 0.146341 0.047346 0.073171 0.045911 0.229555 0.098996 0.074605 0.021521
In [ ]:
#marginal profile of the pages - column totals over the grand total
import numpy
grand_total = DAct.values.sum()
DAct.sum(axis=0)/grand_total
Out[ ]:
DataScience      0.235245
Econometrie      0.077099
RegLogistique    0.070761
Excel            0.071280
Python           0.121987
Prog_R           0.232751
TanagraFR        0.057149
Tutoriels        0.088113
MasterSISE       0.045615
dtype: float64
In [ ]:
#column profiles - each count divided by the total of its column
col_totals = DAct.sum(axis=0)
profil_colonne = DAct.div(col_totals,axis='columns')
profil_colonne
Out[ ]:
DataScience Econometrie RegLogistique Excel Python Prog_R TanagraFR Tutoriels MasterSISE
Pays
Algerie 0.063604 0.068733 0.035242 0.043732 0.048552 0.075893 0.087273 0.081368 0.068337
Belgique 0.016343 0.020216 0.029369 0.013120 0.027257 0.025893 0.034545 0.011792 0.011390
CoteDIvoire 0.027385 0.021563 0.011747 0.020408 0.022147 0.021429 0.007273 0.018868 0.045558
France 0.689488 0.592992 0.745962 0.756560 0.809199 0.718304 0.610909 0.724057 0.719818
Maroc 0.107774 0.123989 0.108664 0.078717 0.057070 0.062500 0.110909 0.091981 0.084282
Senegal 0.014576 0.035040 0.020558 0.013120 0.008518 0.024554 0.023636 0.010613 0.036446
Tunisia 0.080830 0.137466 0.048458 0.074344 0.027257 0.071429 0.125455 0.061321 0.034169
In [ ]:
#marginal profile of the sessions per country - row totals over the grand total
DAct.sum(axis=1)/DAct.values.sum()
Out[ ]:
Pays
Algerie        0.064734
Belgique       0.021301
CoteDIvoire    0.022236
France         0.712074
Maroc          0.088009
Senegal        0.019223
Tunisia        0.072423
dtype: float64

Analyse factorielle des correspondances¶

In [ ]:
#fanalysis package - uncomment the line below to install it if it is missing
#!pip install fanalysis
In [ ]:
#import the computation class (correspondence analysis)
from fanalysis.ca import CA

#instantiate and fit on the active rows, labelled with the countries (rows)
#and the pages (columns) of the table
afc = CA(row_labels=DAct.index,col_labels=DAct.columns,stats=True)
afc.fit(DAct.values)

#eigenvalues
#NOTE(review): judging by the displayed output, the three rows look like
#eigenvalue / % of inertia / cumulative % - confirm against the fanalysis docs
afc.eig_
Out[ ]:
array([[2.10287310e-02, 4.90188473e-03, 3.94268337e-03, 2.44280848e-03,
        1.16914646e-03, 4.44538942e-04],
       [6.19771863e+01, 1.44471401e+01, 1.16201221e+01, 7.19959736e+00,
        3.44578128e+00, 1.31017287e+00],
       [6.19771863e+01, 7.64243264e+01, 8.80444485e+01, 9.52440459e+01,
        9.86898271e+01, 1.00000000e+02]])
In [ ]:
#eigenvalues - plotted with the helper provided by the fitted CA object
afc.plot_eigenvalues()

Analyse des modalités lignes¶

In [ ]:
#collect the row information and list the available columns
#(per the output: coordinates, contributions and cos2 for each dimension)
info_lig = afc.row_topandas()
info_lig.columns
Out[ ]:
Index(['row_coord_dim1', 'row_coord_dim2', 'row_coord_dim3', 'row_coord_dim4',
       'row_coord_dim5', 'row_coord_dim6', 'row_contrib_dim1',
       'row_contrib_dim2', 'row_contrib_dim3', 'row_contrib_dim4',
       'row_contrib_dim5', 'row_contrib_dim6', 'row_cos2_dim1',
       'row_cos2_dim2', 'row_cos2_dim3', 'row_cos2_dim4', 'row_cos2_dim5',
       'row_cos2_dim6'],
      dtype='object')
In [ ]:
#map of the row points on dimensions 1 and 2
afc.mapping_row(num_x_axis=1,num_y_axis=2)
In [ ]:
#coordinates of the row points in the first plane (first two columns only)
coord_lig = afc.row_coord_[:,:2]
coord_lig
Out[ ]:
array([[ 0.10890632, -0.12659145],
       [-0.01167707, -0.18221523],
       [-0.06111603,  0.06493061],
       [-0.07837983,  0.00278858],
       [ 0.19379579,  0.16278733],
       [ 0.23769666, -0.18592683],
       [ 0.39690394, -0.02908044]])
In [ ]:
#to better show the dispersions:
#display in the first factorial plane, with the same range on both axes
import matplotlib.pyplot as plt
lim = 0.45
fig, ax = plt.subplots(figsize=(7,7))
ax.axis([-lim,+lim,-lim,+lim])
#dashed reference lines through the origin
ax.plot([-lim,+lim],[0,0],color='silver',linestyle='--')
ax.plot([0,0],[-lim,+lim],color='silver',linestyle='--')
ax.set_xlabel("Dim.1")
ax.set_ylabel("Dim.2")
ax.set_title("Carte des modalités lignes")
#one text label per active row at its (Dim.1, Dim.2) position
for (x, y), label in zip(coord_lig, DAct.index):
    ax.text(x,y,label)

plt.show()
In [ ]:
#row contributions - 1st factor
afc.plot_row_contrib(num_axis=1)
In [ ]:
#row contributions - 2nd factor
afc.plot_row_contrib(num_axis=2)

Analyse des modalités colonnes¶

In [ ]:
#map of the column points, drawn directly on dimensions 1 and 2
afc.mapping_col(num_x_axis=1,num_y_axis=2)
In [ ]:
#coordinates of the column points in the first plane (first two columns only)
coord_col = afc.col_coord_[:,:2]

#to better show the dispersions:
#display in the first factorial plane, with the same range on both axes
import matplotlib.pyplot as plt
lim = 0.35
fig, ax = plt.subplots(figsize=(7,7))
ax.axis([-lim,+lim,-lim,+lim])
#dashed reference lines through the origin
ax.plot([-lim,+lim],[0,0],color='silver',linestyle='--')
ax.plot([0,0],[-lim,+lim],color='silver',linestyle='--')
ax.set_xlabel("Dim.1")
ax.set_ylabel("Dim.2")
ax.set_title("Carte des modalités colonnes")
#one text label per column at its (Dim.1, Dim.2) position
for (x, y), label in zip(coord_col, DAct.columns):
    ax.text(x,y,label)

plt.show()

Associations lignes-colonnes¶

In [ ]:
#simultaneous representation of rows and columns on dimensions 1 and 2
afc.mapping(num_x_axis=1,num_y_axis=2)

Décomposition du KHI-2¶

In [ ]:
#chi-square statistic of the contingency table (active rows only),
#computed without continuity correction
from scipy.stats import chi2_contingency
res = chi2_contingency(observed=DAct,correction=False)
res
Out[ ]:
Chi2ContingencyResult(statistic=326.5403279528158, pvalue=4.3900928721570904e-43, dof=48, expected_freq=array([[ 146.55777224,   48.03262677,   44.08385287,   44.40752286,
          75.99771405,  145.00415628,   35.60369909,   54.89443059,
          28.41822527],
       [  48.22527016,   15.80527847,   14.50592269,   14.61242727,
          25.00727348,   47.71404821,   11.71550291,   18.06317539,
           9.35110141],
       [  50.34247714,   16.49916874,   15.14276808,   15.25394846,
          26.10515378,   49.80881131,   12.22984206,   18.85619285,
           9.76163757],
       [1612.1354946 ,  528.35889443,  484.92238155,  488.48275145,
         835.97485453, 1595.04571904,  391.64068994,  603.83873649,
         312.60047797],
       [ 199.25270158,   65.3027847 ,   59.93422693,   60.37427265,
         103.32273483,  197.14048213,   48.40502909,   74.63175395,
          38.63601413],
       [  43.52036575,   14.26330008,   13.09071072,   13.18682461,
          22.56753948,   43.05901912,   10.57252702,   16.30091438,
           8.43879884],
       [ 163.96591854,   53.7379468 ,   49.32013716,   49.6822527 ,
          85.02472984,  162.22776392,   39.83270989,   61.41479634,
          31.7937448 ]]))
In [ ]:
#available information - the chi-square statistic
res.statistic
Out[ ]:
326.5403279528158
In [ ]:
#cell-by-cell decomposition: (observed - expected)^2 / expected
observed = DAct.values
expected = res.expected_freq
contribKhi2 = numpy.square(observed - expected)/expected

#check: the contributions add up to the chi-square statistic
contribKhi2.sum()
Out[ ]:
326.5403279528158
In [ ]:
#expressed as a fraction of the available information (the chi-square statistic)
frac_contrib = contribKhi2/res.statistic
df_contrib = pandas.DataFrame(
    data=frac_contrib,
    index=DAct.index,
    columns=DAct.columns,
)
df_contrib
Out[ ]:
DataScience Econometrie RegLogistique Excel Python Prog_R TanagraFR Tutoriels MasterSISE
Pays
Algerie 0.000137 0.000561 0.028021 0.014315 0.014543 0.013195 0.013218 0.011100 0.000270
Belgique 0.008002 0.000126 0.006372 0.006601 0.005988 0.006791 0.013871 0.011023 0.006200
CoteDIvoire 0.008267 0.000046 0.010318 0.000316 0.000001 0.000201 0.016960 0.001325 0.032885
France 0.004967 0.045252 0.003363 0.005839 0.047629 0.000374 0.024208 0.000524 0.000113
Maroc 0.030775 0.033424 0.010109 0.002061 0.039104 0.050720 0.010036 0.000466 0.000212
Senegal 0.007788 0.029576 0.000193 0.004071 0.021433 0.010141 0.001707 0.010014 0.020747
Tunisia 0.006767 0.132738 0.016538 0.000107 0.101269 0.000094 0.065406 0.004420 0.027165
In [ ]:
#shown as a heatmap, colour scale running from 0 to the largest fraction
import seaborn as sns
largest_frac = frac_contrib.max()
sns.heatmap(df_contrib,vmin=0,vmax=largest_frac,cmap="Greens")
Out[ ]:
<Axes: ylabel='Pays'>

Résidus standardisés¶

In [ ]:
#standardized residuals: (observed - expected) / sqrt(expected)
expected = res.expected_freq
res_std = (DAct.values-expected)/numpy.sqrt(expected)
df_res_std = pandas.DataFrame(
    data=res_std,
    index=DAct.index,
    columns=DAct.columns,
)
df_res_std
Out[ ]:
DataScience Econometrie RegLogistique Excel Python Prog_R TanagraFR Tutoriels MasterSISE
Pays
Algerie -0.211280 0.428158 -3.024874 -2.162027 -2.179220 2.075762 2.077517 1.903823 0.296720
Belgique -1.616440 -0.202556 1.442521 -1.468215 1.398342 1.489091 2.128233 -1.897182 -1.422879
CoteDIvoire 1.643005 -0.122890 -1.835540 -0.321062 -0.020581 -0.256295 -2.353320 -0.657749 3.276945
France -1.273567 -3.844021 1.047985 1.380768 3.943701 0.349398 -2.811568 0.413511 0.192275
Maroc 3.170040 3.303696 1.816880 -0.820360 -3.573391 -4.069640 1.810306 0.389890 -0.263203
Senegal -1.594720 3.107678 0.251316 -1.152961 -2.645502 1.819734 0.746560 -1.808303 2.602858
Tunisia 1.486467 6.583628 -2.323869 0.186953 -5.750503 -0.174907 4.621428 -1.201363 -2.978357
In [ ]:
#as a heatmap, with a diverging palette centred on zero
palette = sns.diverging_palette(10,250,as_cmap=True)
sns.heatmap(df_res_std,center=0.0,cmap=palette)
Out[ ]:
<Axes: ylabel='Pays'>

Point ligne supplémentaire¶

In [ ]:
#retrieve the Canada row from the full table
canada = D.loc['Canada']
canada
Out[ ]:
DataScience      45
Econometrie      10
RegLogistique    27
Excel            13
Python           25
Prog_R           43
TanagraFR        29
Tutoriels        23
MasterSISE        3
Name: Canada, dtype: int64
In [ ]:
#compute the factorial coordinates of Canada with the fitted analysis
#(the single row is wrapped in a list so that a 2-D input is passed)
coord_canada = afc.transform([canada.values])
coord_canada
Out[ ]:
array([[ 0.06941696, -0.00143052, -0.32631885, -0.01396041,  0.23148291,
         0.04045268]])
In [ ]:
#first (and only) row of the result: the coordinates of Canada
coord_canada[0]
Out[ ]:
array([ 0.06941696, -0.00143052, -0.32631885, -0.01396041,  0.23148291,
        0.04045268])
In [ ]:
#positioning in the first factorial plane
lim = 0.45
fig, ax = plt.subplots(figsize=(7,7))
ax.axis([-lim,+lim,-lim,+lim])
#dashed reference lines through the origin
ax.plot([-lim,+lim],[0,0],color='silver',linestyle='--')
ax.plot([0,0],[-lim,+lim],color='silver',linestyle='--')
ax.set_xlabel("Dim.1")
ax.set_ylabel("Dim.2")
ax.set_title("Carte des modalités lignes + Canada")
#one text label per active row at its (Dim.1, Dim.2) position
for (x, y), label in zip(coord_lig, DAct.index):
    ax.text(x,y,label)
#and then Canada, placed at its first two coordinates
can_x, can_y = coord_canada[0][:2]
ax.text(can_x,can_y,"Canada",color="Blue")
plt.show()