#working directory: the workbook below is opened relative to it
import os
os.chdir("C:/Users/ricco/Desktop/demo")

#load the active individuals from the "data" sheet of the workbook
import pandas
X = pandas.read_excel(
    io="loisirs_subset.xlsx",
    sheet_name="data",
)

#print the structure of the table (columns, non-null counts, dtypes)
X.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7836 entries, 0 to 7835 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Reading 7836 non-null object 1 Listening_music 7836 non-null object 2 Cinema 7836 non-null object 3 Show 7836 non-null object 4 Exhibition 7836 non-null object 5 Computer 7836 non-null object 6 Sport 7836 non-null object 7 Walking 7836 non-null object 8 Travelling 7836 non-null object 9 Playing_music 7836 non-null object 10 Collecting 7836 non-null object 11 Volunteering 7836 non-null object 12 Mechanic 7836 non-null object 13 Gardening 7836 non-null object 14 Knitting 7836 non-null object 15 Cooking 7836 non-null object 16 Fishing 7836 non-null object 17 TV 7836 non-null object dtypes: object(18) memory usage: 1.1+ MB
#number of observations (rows) and number of variables (columns)
n, p = X.shape
print(n)
print(p)
7836 18
#on-the-fly installation of the package (if needed)
#!pip install fanalysis
#import the multiple correspondence analysis class
from fanalysis.mca import MCA
#instantiation: rows are labelled with the DataFrame index, variables with its column names
acm = MCA(row_labels=X.index,var_labels=X.columns)
#run the analysis on the raw values of the table
acm.fit(X.values)
#eigenvalues (bare expression: only echoed in an interactive/notebook session)
#NOTE(review): the echoed array has three rows that look like eigenvalues,
#percentage of inertia and cumulative percentage - confirm against the fanalysis docs
acm.eig_
array([[1.94203920e-01, 8.17135274e-02, 7.14523138e-02, 6.35737241e-02, 5.88448160e-02, 5.58523058e-02, 5.55688411e-02, 5.32930394e-02, 5.27382461e-02, 4.91374174e-02, 4.67390321e-02, 4.50296758e-02, 4.38053722e-02, 4.35085505e-02, 4.09629230e-02, 3.83005299e-02, 3.72658100e-02, 3.64229806e-02, 3.53180258e-02, 3.23581436e-02, 3.05774725e-02], [1.66460503e+01, 7.00401663e+00, 6.12448404e+00, 5.44917635e+00, 5.04384137e+00, 4.78734050e+00, 4.76304352e+00, 4.56797480e+00, 4.52042110e+00, 4.21177863e+00, 4.00620275e+00, 3.85968650e+00, 3.75474619e+00, 3.72930433e+00, 3.51110768e+00, 3.28290257e+00, 3.19421229e+00, 3.12196977e+00, 3.02725935e+00, 2.77355517e+00, 2.62092621e+00], [1.66460503e+01, 2.36500669e+01, 2.97745509e+01, 3.52237273e+01, 4.02675686e+01, 4.50549091e+01, 4.98179527e+01, 5.43859275e+01, 5.89063486e+01, 6.31181272e+01, 6.71243299e+01, 7.09840164e+01, 7.47387626e+01, 7.84680670e+01, 8.19791746e+01, 8.52620772e+01, 8.84562895e+01, 9.15782593e+01, 9.46055186e+01, 9.73790738e+01, 1.00000000e+02]])
#coordinates and related information about the columns, exported via col_topandas()
#NOTE(review): presumably one row per category and a pandas DataFrame - confirm
info_col = acm.col_topandas()
#list the available columns (bare expression: only echoed in an interactive/notebook session)
info_col.columns
Index(['col_coord_dim1', 'col_coord_dim2', 'col_coord_dim3', 'col_coord_dim4', 'col_coord_dim5', 'col_coord_dim6', 'col_coord_dim7', 'col_coord_dim8', 'col_coord_dim9', 'col_coord_dim10', 'col_coord_dim11', 'col_coord_dim12', 'col_coord_dim13', 'col_coord_dim14', 'col_coord_dim15', 'col_coord_dim16', 'col_coord_dim17', 'col_coord_dim18', 'col_coord_dim19', 'col_coord_dim20', 'col_coord_dim21', 'col_contrib_dim1', 'col_contrib_dim2', 'col_contrib_dim3', 'col_contrib_dim4', 'col_contrib_dim5', 'col_contrib_dim6', 'col_contrib_dim7', 'col_contrib_dim8', 'col_contrib_dim9', 'col_contrib_dim10', 'col_contrib_dim11', 'col_contrib_dim12', 'col_contrib_dim13', 'col_contrib_dim14', 'col_contrib_dim15', 'col_contrib_dim16', 'col_contrib_dim17', 'col_contrib_dim18', 'col_contrib_dim19', 'col_contrib_dim20', 'col_contrib_dim21', 'col_cos2_dim1', 'col_cos2_dim2', 'col_cos2_dim3', 'col_cos2_dim4', 'col_cos2_dim5', 'col_cos2_dim6', 'col_cos2_dim7', 'col_cos2_dim8', 'col_cos2_dim9', 'col_cos2_dim10', 'col_cos2_dim11', 'col_cos2_dim12', 'col_cos2_dim13', 'col_cos2_dim14', 'col_cos2_dim15', 'col_cos2_dim16', 'col_cos2_dim17', 'col_cos2_dim18', 'col_cos2_dim19', 'col_cos2_dim20', 'col_cos2_dim21'], dtype='object')
#coordinates in the first factorial plane (dimensions 1 and 2 only)
coord_col = info_col.loc[:, ['col_coord_dim1', 'col_coord_dim2']]
print(coord_col)
col_coord_dim1 col_coord_dim2 Reading_n 0.733412 -0.041586 Reading_y -0.362301 0.020543 Listening_music_n 0.839848 0.254103 Listening_music_y -0.318176 -0.096267 Cinema_n 0.527020 0.307260 Cinema_y -0.718375 -0.418822 Show_n 0.405560 0.112424 Show_y -0.943322 -0.261496 Exhibition_n 0.431890 -0.007079 Exhibition_y -0.929998 0.015243 Computer_n 0.455623 0.200060 Computer_y -0.681404 -0.299199 Sport_n 0.415821 0.181959 Sport_y -0.653901 -0.286140 Walking_n 0.412714 -0.338255 Walking_y -0.397413 0.325715 Travelling_n 0.496057 0.007750 Travelling_y -0.711117 -0.011110 Playing_music_n 0.214572 0.029402 Playing_music_y -0.972008 -0.133189 Collecting_n 0.067469 -0.057665 Collecting_y -0.559681 0.478349 Volunteering_n 0.139880 -0.045265 Volunteering_y -0.751258 0.243108 Mechanic_n 0.307265 -0.345452 Mechanic_y -0.394287 0.443290 Gardening_n 0.177713 -0.553866 Gardening_y -0.261996 0.816546 Knitting_n 0.050650 -0.179588 Knitting_y -0.252092 0.893832 Cooking_n 0.311962 -0.342517 Cooking_y -0.378000 0.415023 Fishing_n -0.003218 -0.106732 Fishing_y 0.024103 0.799389 TV_n0 0.473323 -0.317982 TV_n1 -0.271869 -0.137644 TV_n2 -0.175706 0.109968 TV_n3 -0.028438 0.218662 TV_n4 0.132535 -0.060858
#map of the categories in the first factorial plane
#a square figure with identical limits on both axes, to better show the dispersions
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(7,7))
ax.axis([-1.2,+1.2,-1.2,+1.2])
#dashed guide lines through the origin
ax.plot([-1.2,+1.2],[0,0],color='silver',linestyle='--')
ax.plot([0,0],[-1.2,+1.2],color='silver',linestyle='--')
ax.set_xlabel("Dim.1")
ax.set_ylabel("Dim.2")
#title set through the Axes object, like the rest of the drawing calls
ax.set_title("Carte des modalités")
#write each category label at its (dim1, dim2) coordinates
for x,y,lbl in zip(coord_col.iloc[:,0],coord_col.iloc[:,1],coord_col.index):
    ax.text(x,y,lbl,horizontalalignment='center',verticalalignment='center',fontsize=7)
plt.show()
#category frequencies (bare expression: only echoed in an interactive/notebook session)
acm.c_
array([[2591., 5245., 2153., 5683., 4520., 3316., 5480., 2356., 5351., 2485., 4696., 3140., 4790., 3046., 3844., 3992., 4616., 3220., 6419., 1417., 6993., 843., 6606., 1230., 4404., 3432., 4669., 3167., 6525., 1311., 4293., 3543., 6913., 923., 951., 1157., 2040., 1652., 2036.]])
#cross-check against the raw data: category counts of the first variable
import numpy
numpy.unique(X["Reading"].values, return_counts=True)
(array(['n', 'y'], dtype=object), array([2591, 5245], dtype=int64))
#same cross-check for the last variable
numpy.unique(X["TV"].values, return_counts=True)
(array(['n0', 'n1', 'n2', 'n3', 'n4'], dtype=object), array([ 951, 1157, 2040, 1652, 2036], dtype=int64))
#MASS = relative weight of the categories
#(book "Analyse Factorielle", page 297)
#it is also the average profile (same book, page 290)
#each category count is divided by n*p, the total number of cells in the data table
nb_cells = n * p
col_mass = acm.c_ / nb_cells
print(col_mass)
[[0.01836963 0.03718592 0.01526431 0.04029125 0.03204583 0.02350973 0.03885202 0.01670353 0.03793744 0.01761812 0.03329363 0.02226193 0.03396007 0.02159549 0.02725313 0.02830242 0.03272645 0.02282911 0.04550933 0.01004623 0.04957887 0.00597669 0.04683512 0.00872044 0.03122341 0.02433214 0.03310221 0.02245335 0.04626085 0.00929471 0.03043645 0.02511911 0.04901168 0.00654387 0.00674239 0.00820288 0.01446316 0.01171232 0.0144348 ]]
#no warnings: silences every warning category for the rest of the session
import warnings
warnings.filterwarnings(action='ignore')
#K-Means from scikit-learn - let us say 3 clusters
from sklearn.cluster import KMeans
#n_init is pinned explicitly: its default changed from 10 to 'auto' in
#scikit-learn 1.4, so leaving it implicit makes the partition depend on the
#installed version (and is what triggers the FutureWarning on 1.2/1.3)
km = KMeans(n_clusters=3,n_init=10,random_state=0)
#training - weighted data through the sample_weight option
#col_mass is 2-D with a single row, so its first row gives one weight per category
km.fit(X=coord_col,sample_weight=col_mass[0])
#cluster centres found by K-Means, in the coordinates the model was fitted on
moyennes = km.cluster_centers_
print(moyennes)
[[ 0.29545093 -0.06231481] [-0.60880095 -0.13321424] [-0.30111167 0.46770847]]
#display the cluster label assigned to each row of coord_col
print(km.labels_)
[0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 2 0 1 0 1 0 2 0 1 0 2 0 2 0 2 0 2 0 2 0 1 2 2 0]
#coordinates together with the cluster labels
#assign() builds a new DataFrame, so coord_col itself is left untouched
coord_grp = coord_col.assign(groupes=km.labels_)
#control display of the first rows
coord_grp.head()
| | col_coord_dim1 | col_coord_dim2 | groupes |
---|---|---|---|
Reading_n | 0.733412 | -0.041586 | 0 |
Reading_y | -0.362301 | 0.020543 | 1 |
Listening_music_n | 0.839848 | 0.254103 | 0 |
Listening_music_y | -0.318176 | -0.096267 | 1 |
Cinema_n | 0.527020 | 0.307260 | 0 |
#group handling: gather the categories by cluster label
gb = coord_grp.groupby('groupes')
#display the members of each group
#gb.indices maps each group key to the integer positions of its rows
for k, positions in gb.indices.items():
    print("\nGroupe : ",k)
    print(coord_grp.index[positions])
Groupe : 0 Index(['Reading_n', 'Listening_music_n', 'Cinema_n', 'Show_n', 'Exhibition_n', 'Computer_n', 'Sport_n', 'Walking_n', 'Travelling_n', 'Playing_music_n', 'Collecting_n', 'Volunteering_n', 'Mechanic_n', 'Gardening_n', 'Knitting_n', 'Cooking_n', 'Fishing_n', 'TV_n0', 'TV_n4'], dtype='object') Groupe : 1 Index(['Reading_y', 'Listening_music_y', 'Cinema_y', 'Show_y', 'Exhibition_y', 'Computer_y', 'Sport_y', 'Travelling_y', 'Playing_music_y', 'Volunteering_y', 'TV_n1'], dtype='object') Groupe : 2 Index(['Walking_y', 'Collecting_y', 'Mechanic_y', 'Gardening_y', 'Knitting_y', 'Cooking_y', 'Fishing_y', 'TV_n2', 'TV_n3'], dtype='object')
#map of the categories coloured by K-Means cluster
#NOTE(review): seaborn is imported but not used in the visible code
import seaborn
fig, ax = plt.subplots(figsize=(7,7))
ax.axis([-1.2,+1.2,-1.2,+1.2])
#dashed guide lines through the origin
ax.plot([-1.2,+1.2],[0,0],color='silver',linestyle='--')
ax.plot([0,0],[-1.2,+1.2],color='silver',linestyle='--')
ax.set_xlabel("Dim.1")
ax.set_ylabel("Dim.2")
ax.set_title("Carte des modalités - Groupes K-Means")
#colours for the groups, indexed by cluster label (one entry per cluster)
couleurs = ['orangered','seagreen','dodgerblue']
#for each group key
for k in gb.indices.keys():
    #place the associated barycentre (cluster centre) as a star
    ax.plot(moyennes[k,0],moyennes[k,1],marker="*",markersize=8,color=couleurs[k])
    #for each member of the group, write its label in the group colour
    for i in gb.indices[k]:
        ax.text(coord_col.iloc[i,0],coord_col.iloc[i,1],coord_col.index[i],
                color=couleurs[k],horizontalalignment='center',verticalalignment='center',fontsize=7)
plt.show()