# Change the working directory so the relative file name used below resolves.
# NOTE(review): hard-coded absolute path; adjust it to the local machine.
import os
os.chdir("C:/Users/ricco/Desktop/demo")
# Data table - traffic on my pages in Nov 2017
# cf. tutorial => http://tutoriels-data-science.blogspot.com/p/tutoriels-en-francais.html#5640945788359221641
import pandas
# The first column of the sheet is used as the row index.
D = pandas.read_excel('FrequentationNov2017.xlsx',index_col=0)
# Display the table (notebook cell output).
D
DataScience | Econometrie | RegLogistique | Excel | Python | Prog_R | TanagraFR | Tutoriels | MasterSISE | |
---|---|---|---|---|---|---|---|---|---|
Pays | |||||||||
Algerie | 144 | 51 | 24 | 30 | 57 | 170 | 48 | 69 | 30 |
Belgique | 37 | 15 | 20 | 9 | 32 | 58 | 19 | 10 | 5 |
CoteDIvoire | 62 | 16 | 8 | 14 | 26 | 48 | 4 | 16 | 20 |
France | 1561 | 440 | 508 | 519 | 950 | 1609 | 336 | 614 | 316 |
Maroc | 244 | 92 | 74 | 54 | 67 | 140 | 61 | 78 | 37 |
Senegal | 33 | 26 | 14 | 9 | 10 | 55 | 13 | 9 | 16 |
Tunisia | 183 | 102 | 33 | 51 | 32 | 160 | 69 | 52 | 15 |
Canada | 45 | 10 | 27 | 13 | 25 | 43 | 29 | 23 | 3 |
# Active rows: every row of D whose index label is not 'Canada'
# (Canada is retrieved separately further down in the file).
is_active = D.index != 'Canada'
DAct = D.loc[is_active, :]
DAct
DataScience | Econometrie | RegLogistique | Excel | Python | Prog_R | TanagraFR | Tutoriels | MasterSISE | |
---|---|---|---|---|---|---|---|---|---|
Pays | |||||||||
Algerie | 144 | 51 | 24 | 30 | 57 | 170 | 48 | 69 | 30 |
Belgique | 37 | 15 | 20 | 9 | 32 | 58 | 19 | 10 | 5 |
CoteDIvoire | 62 | 16 | 8 | 14 | 26 | 48 | 4 | 16 | 20 |
France | 1561 | 440 | 508 | 519 | 950 | 1609 | 336 | 614 | 316 |
Maroc | 244 | 92 | 74 | 54 | 67 | 140 | 61 | 78 | 37 |
Senegal | 33 | 26 | 14 | 9 | 10 | 55 | 13 | 9 | 16 |
Tunisia | 183 | 102 | 33 | 51 | 32 | 160 | 69 | 52 | 15 |
# Row profiles: each cell divided by the total of its row
row_totals = DAct.sum(axis=1)
profil_ligne = DAct.divide(row_totals, axis='index')
profil_ligne
DataScience | Econometrie | RegLogistique | Excel | Python | Prog_R | TanagraFR | Tutoriels | MasterSISE | |
---|---|---|---|---|---|---|---|---|---|
Pays | |||||||||
Algerie | 0.231140 | 0.081862 | 0.038523 | 0.048154 | 0.091493 | 0.272873 | 0.077047 | 0.110754 | 0.048154 |
Belgique | 0.180488 | 0.073171 | 0.097561 | 0.043902 | 0.156098 | 0.282927 | 0.092683 | 0.048780 | 0.024390 |
CoteDIvoire | 0.289720 | 0.074766 | 0.037383 | 0.065421 | 0.121495 | 0.224299 | 0.018692 | 0.074766 | 0.093458 |
France | 0.227783 | 0.064205 | 0.074128 | 0.075733 | 0.138625 | 0.234788 | 0.049030 | 0.089596 | 0.046111 |
Maroc | 0.288076 | 0.108619 | 0.087367 | 0.063754 | 0.079103 | 0.165289 | 0.072019 | 0.092090 | 0.043684 |
Senegal | 0.178378 | 0.140541 | 0.075676 | 0.048649 | 0.054054 | 0.297297 | 0.070270 | 0.048649 | 0.086486 |
Tunisia | 0.262554 | 0.146341 | 0.047346 | 0.073171 | 0.045911 | 0.229555 | 0.098996 | 0.074605 | 0.021521 |
# Marginal profile of the pages: column totals over the grand total
import numpy
grand_total = numpy.sum(DAct.values)
DAct.sum(axis=0) / grand_total
DataScience 0.235245 Econometrie 0.077099 RegLogistique 0.070761 Excel 0.071280 Python 0.121987 Prog_R 0.232751 TanagraFR 0.057149 Tutoriels 0.088113 MasterSISE 0.045615 dtype: float64
# Column profiles: each cell divided by the total of its column
col_totals = DAct.sum(axis=0)
profil_colonne = DAct.divide(col_totals, axis='columns')
profil_colonne
DataScience | Econometrie | RegLogistique | Excel | Python | Prog_R | TanagraFR | Tutoriels | MasterSISE | |
---|---|---|---|---|---|---|---|---|---|
Pays | |||||||||
Algerie | 0.063604 | 0.068733 | 0.035242 | 0.043732 | 0.048552 | 0.075893 | 0.087273 | 0.081368 | 0.068337 |
Belgique | 0.016343 | 0.020216 | 0.029369 | 0.013120 | 0.027257 | 0.025893 | 0.034545 | 0.011792 | 0.011390 |
CoteDIvoire | 0.027385 | 0.021563 | 0.011747 | 0.020408 | 0.022147 | 0.021429 | 0.007273 | 0.018868 | 0.045558 |
France | 0.689488 | 0.592992 | 0.745962 | 0.756560 | 0.809199 | 0.718304 | 0.610909 | 0.724057 | 0.719818 |
Maroc | 0.107774 | 0.123989 | 0.108664 | 0.078717 | 0.057070 | 0.062500 | 0.110909 | 0.091981 | 0.084282 |
Senegal | 0.014576 | 0.035040 | 0.020558 | 0.013120 | 0.008518 | 0.024554 | 0.023636 | 0.010613 | 0.036446 |
Tunisia | 0.080830 | 0.137466 | 0.048458 | 0.074344 | 0.027257 | 0.071429 | 0.125455 | 0.061321 | 0.034169 |
# Marginal profile of the sessions per country: row totals over the grand total
DAct.sum(axis=1) / DAct.values.sum()
Pays Algerie 0.064734 Belgique 0.021301 CoteDIvoire 0.022236 France 0.712074 Maroc 0.088009 Senegal 0.019223 Tunisia 0.072423 dtype: float64
# fanalysis package
#!pip install fanalysis
# Import the class that performs the computation
from fanalysis.ca import CA
# Instantiate and fit on the active rows; row/column labels come from DAct,
# and the raw counts are passed as a plain array.
# NOTE(review): stats=True presumably enables the contrib/cos2 statistics
# used below - confirm against the fanalysis documentation.
afc = CA(row_labels=DAct.index,col_labels=DAct.columns,stats=True)
afc.fit(DAct.values)
# Eigenvalues
# NOTE(review): the displayed array has three rows; they look like the
# eigenvalues, their percentages and the cumulative percentages - confirm.
afc.eig_
array([[2.10287310e-02, 4.90188473e-03, 3.94268337e-03, 2.44280848e-03, 1.16914646e-03, 4.44538942e-04], [6.19771863e+01, 1.44471401e+01, 1.16201221e+01, 7.19959736e+00, 3.44578128e+00, 1.31017287e+00], [6.19771863e+01, 7.64243264e+01, 8.80444485e+01, 9.52440459e+01, 9.86898271e+01, 1.00000000e+02]])
# Eigenvalues - graphical display
afc.plot_eigenvalues()
# Retrieve the row information as a pandas object and list its columns
# (coordinates, contributions and cos2 per dimension, per the output below).
info_lig = afc.row_topandas()
info_lig.columns
Index(['row_coord_dim1', 'row_coord_dim2', 'row_coord_dim3', 'row_coord_dim4', 'row_coord_dim5', 'row_coord_dim6', 'row_contrib_dim1', 'row_contrib_dim2', 'row_contrib_dim3', 'row_contrib_dim4', 'row_contrib_dim5', 'row_contrib_dim6', 'row_cos2_dim1', 'row_cos2_dim2', 'row_cos2_dim3', 'row_cos2_dim4', 'row_cos2_dim5', 'row_cos2_dim6'], dtype='object')
# Map of the row points on dimensions 1 and 2
afc.mapping_row(num_x_axis=1,num_y_axis=2)
# Coordinates of the row points in the first plane (first two columns)
coord_lig = afc.row_coord_[:,:2]
coord_lig
array([[ 0.10890632, -0.12659145], [-0.01167707, -0.18221523], [-0.06111603, 0.06493061], [-0.07837983, 0.00278858], [ 0.19379579, 0.16278733], [ 0.23769666, -0.18592683], [ 0.39690394, -0.02908044]])
# To better reflect the dispersions: draw the first factorial plane on a
# square figure with identical limits on both axes.
import matplotlib.pyplot as plt
axis_lim = 0.45  # common half-range for both axes
fig, ax = plt.subplots(figsize=(7,7))
ax.axis([-axis_lim, +axis_lim, -axis_lim, +axis_lim])
# Dashed guide lines through the origin
ax.plot([-axis_lim, +axis_lim], [0, 0], color='silver', linestyle='--')
ax.plot([0, 0], [-axis_lim, +axis_lim], color='silver', linestyle='--')
ax.set_xlabel("Dim.1")
ax.set_ylabel("Dim.2")
ax.set_title("Carte des modalités lignes")
# One text label per active row, placed at its (Dim.1, Dim.2) coordinates.
# The loop body must be indented; it was flush-left in the original.
for label, (x, y) in zip(DAct.index, coord_lig):
    ax.text(x, y, label)
plt.show()
# Row contributions - 1st factor
afc.plot_row_contrib(num_axis=1)
# Row contributions - 2nd factor
afc.plot_row_contrib(num_axis=2)
# Directly draw the map of the column points in the plane (dimensions 1 and 2)
afc.mapping_col(num_x_axis=1,num_y_axis=2)
# Coordinates of the column points in the first plane (first two columns)
coord_col = afc.col_coord_[:,:2]
# To better reflect the dispersions: draw the first factorial plane on a
# square figure with identical limits on both axes.
import matplotlib.pyplot as plt
axis_lim = 0.35  # common half-range for both axes
fig, ax = plt.subplots(figsize=(7,7))
ax.axis([-axis_lim, +axis_lim, -axis_lim, +axis_lim])
# Dashed guide lines through the origin
ax.plot([-axis_lim, +axis_lim], [0, 0], color='silver', linestyle='--')
ax.plot([0, 0], [-axis_lim, +axis_lim], color='silver', linestyle='--')
ax.set_xlabel("Dim.1")
ax.set_ylabel("Dim.2")
ax.set_title("Carte des modalités colonnes")
# One text label per column, placed at its (Dim.1, Dim.2) coordinates.
# The loop body must be indented; it was flush-left in the original.
for label, (x, y) in zip(DAct.columns, coord_col):
    ax.text(x, y, label)
plt.show()
# Simultaneous representation on dimensions 1 and 2
afc.mapping(num_x_axis=1,num_y_axis=2)
# Chi-square test on the contingency table of the active rows
from scipy.stats import chi2_contingency
# correction=False: no continuity correction is requested.
res = chi2_contingency(DAct,correction=False)
# Display the result object (statistic, p-value, dof, expected frequencies).
res
Chi2ContingencyResult(statistic=326.5403279528158, pvalue=4.3900928721570904e-43, dof=48, expected_freq=array([[ 146.55777224, 48.03262677, 44.08385287, 44.40752286, 75.99771405, 145.00415628, 35.60369909, 54.89443059, 28.41822527], [ 48.22527016, 15.80527847, 14.50592269, 14.61242727, 25.00727348, 47.71404821, 11.71550291, 18.06317539, 9.35110141], [ 50.34247714, 16.49916874, 15.14276808, 15.25394846, 26.10515378, 49.80881131, 12.22984206, 18.85619285, 9.76163757], [1612.1354946 , 528.35889443, 484.92238155, 488.48275145, 835.97485453, 1595.04571904, 391.64068994, 603.83873649, 312.60047797], [ 199.25270158, 65.3027847 , 59.93422693, 60.37427265, 103.32273483, 197.14048213, 48.40502909, 74.63175395, 38.63601413], [ 43.52036575, 14.26330008, 13.09071072, 13.18682461, 22.56753948, 43.05901912, 10.57252702, 16.30091438, 8.43879884], [ 163.96591854, 53.7379468 , 49.32013716, 49.6822527 , 85.02472984, 162.22776392, 39.83270989, 61.41479634, 31.7937448 ]]))
# Available information - the chi-square statistic
res.statistic
326.5403279528158
# Decomposed cell by cell: (observed - expected)^2 / expected
deviation = DAct.values - res.expected_freq
contribKhi2 = deviation**2 / res.expected_freq
# Check: the sum of the cell contributions (compare with res.statistic)
contribKhi2.sum()
326.5403279528158
# Each cell's contribution as a fraction of the chi-square statistic
frac_contrib = contribKhi2 / res.statistic
# Wrap in a DataFrame carrying the same row/column labels as DAct
df_contrib = pandas.DataFrame(
    data=frac_contrib,
    index=DAct.index,
    columns=DAct.columns,
)
df_contrib
DataScience | Econometrie | RegLogistique | Excel | Python | Prog_R | TanagraFR | Tutoriels | MasterSISE | |
---|---|---|---|---|---|---|---|---|---|
Pays | |||||||||
Algerie | 0.000137 | 0.000561 | 0.028021 | 0.014315 | 0.014543 | 0.013195 | 0.013218 | 0.011100 | 0.000270 |
Belgique | 0.008002 | 0.000126 | 0.006372 | 0.006601 | 0.005988 | 0.006791 | 0.013871 | 0.011023 | 0.006200 |
CoteDIvoire | 0.008267 | 0.000046 | 0.010318 | 0.000316 | 0.000001 | 0.000201 | 0.016960 | 0.001325 | 0.032885 |
France | 0.004967 | 0.045252 | 0.003363 | 0.005839 | 0.047629 | 0.000374 | 0.024208 | 0.000524 | 0.000113 |
Maroc | 0.030775 | 0.033424 | 0.010109 | 0.002061 | 0.039104 | 0.050720 | 0.010036 | 0.000466 | 0.000212 |
Senegal | 0.007788 | 0.029576 | 0.000193 | 0.004071 | 0.021433 | 0.010141 | 0.001707 | 0.010014 | 0.020747 |
Tunisia | 0.006767 | 0.132738 | 0.016538 | 0.000107 | 0.101269 | 0.000094 | 0.065406 | 0.004420 | 0.027165 |
# Shown as a heatmap; the colour scale runs from 0 to the largest fraction
import seaborn as sns
sns.heatmap(df_contrib,vmin=0,vmax=numpy.max(frac_contrib),cmap="Greens")
<Axes: ylabel='Pays'>
# Standardized residuals: (observed - expected) / sqrt(expected)
obs_minus_exp = DAct.values - res.expected_freq
res_std = obs_minus_exp / numpy.sqrt(res.expected_freq)
# Wrap in a DataFrame carrying the same row/column labels as DAct
df_res_std = pandas.DataFrame(
    data=res_std,
    index=DAct.index,
    columns=DAct.columns,
)
df_res_std
DataScience | Econometrie | RegLogistique | Excel | Python | Prog_R | TanagraFR | Tutoriels | MasterSISE | |
---|---|---|---|---|---|---|---|---|---|
Pays | |||||||||
Algerie | -0.211280 | 0.428158 | -3.024874 | -2.162027 | -2.179220 | 2.075762 | 2.077517 | 1.903823 | 0.296720 |
Belgique | -1.616440 | -0.202556 | 1.442521 | -1.468215 | 1.398342 | 1.489091 | 2.128233 | -1.897182 | -1.422879 |
CoteDIvoire | 1.643005 | -0.122890 | -1.835540 | -0.321062 | -0.020581 | -0.256295 | -2.353320 | -0.657749 | 3.276945 |
France | -1.273567 | -3.844021 | 1.047985 | 1.380768 | 3.943701 | 0.349398 | -2.811568 | 0.413511 | 0.192275 |
Maroc | 3.170040 | 3.303696 | 1.816880 | -0.820360 | -3.573391 | -4.069640 | 1.810306 | 0.389890 | -0.263203 |
Senegal | -1.594720 | 3.107678 | 0.251316 | -1.152961 | -2.645502 | 1.819734 | 0.746560 | -1.808303 | 2.602858 |
Tunisia | 1.486467 | 6.583628 | -2.323869 | 0.186953 | -5.750503 | -0.174907 | 4.621428 | -1.201363 | -2.978357 |
# As a heatmap, with a diverging palette centred on 0
sns.heatmap(df_res_std,center=0.0,cmap=sns.diverging_palette(10,250,as_cmap=True))
<Axes: ylabel='Pays'>
# Retrieve the Canada row from the full table D
canada = D.loc['Canada',:]
canada
DataScience 45 Econometrie 10 RegLogistique 27 Excel 13 Python 25 Prog_R 43 TanagraFR 29 Tutoriels 23 MasterSISE 3 Name: Canada, dtype: int64
# Compute the factor coordinates of Canada with the fitted model;
# the values are wrapped in a list so that a single row is passed.
coord_canada = afc.transform([canada.values])
coord_canada
array([[ 0.06941696, -0.00143052, -0.32631885, -0.01396041, 0.23148291, 0.04045268]])
# First (and only) row of the result: Canada's coordinates on every dimension
coord_canada[0]
array([ 0.06941696, -0.00143052, -0.32631885, -0.01396041, 0.23148291, 0.04045268])
# Position Canada in the first factorial plane, on top of the active rows
axis_lim = 0.45  # common half-range for both axes
fig, ax = plt.subplots(figsize=(7,7))
ax.axis([-axis_lim, +axis_lim, -axis_lim, +axis_lim])
# Dashed guide lines through the origin
ax.plot([-axis_lim, +axis_lim], [0, 0], color='silver', linestyle='--')
ax.plot([0, 0], [-axis_lim, +axis_lim], color='silver', linestyle='--')
ax.set_xlabel("Dim.1")
ax.set_ylabel("Dim.2")
ax.set_title("Carte des modalités lignes + Canada")
# One text label per active row, placed at its (Dim.1, Dim.2) coordinates.
# The loop body must be indented; it was flush-left in the original.
for label, (x, y) in zip(DAct.index, coord_lig):
    ax.text(x, y, label)
# ... and then Canada, drawn once (outside the loop) in blue
ax.text(coord_canada[0][0], coord_canada[0][1], "Canada", color="Blue")
plt.show()