Importation et inspection des données

In [ ]:
# Working directory: every relative path below is resolved against it.
# NOTE(review): this is a hard-coded absolute path on the author's machine;
# the notebook will not run elsewhere until it is adapted.
import os
os.chdir("C:/Users/ricco/Desktop/demo")

# Load the active individuals from the "data" sheet of the Excel workbook,
# then print a summary of the columns (names, non-null counts, dtypes).
import pandas
X = pandas.read_excel("loisirs_subset.xlsx",sheet_name="data")
X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7836 entries, 0 to 7835
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Reading          7836 non-null   object
 1   Listening_music  7836 non-null   object
 2   Cinema           7836 non-null   object
 3   Show             7836 non-null   object
 4   Exhibition       7836 non-null   object
 5   Computer         7836 non-null   object
 6   Sport            7836 non-null   object
 7   Walking          7836 non-null   object
 8   Travelling       7836 non-null   object
 9   Playing_music    7836 non-null   object
 10  Collecting       7836 non-null   object
 11  Volunteering     7836 non-null   object
 12  Mechanic         7836 non-null   object
 13  Gardening        7836 non-null   object
 14  Knitting         7836 non-null   object
 15  Cooking          7836 non-null   object
 16  Fishing          7836 non-null   object
 17  TV               7836 non-null   object
dtypes: object(18)
memory usage: 1.1+ MB
In [ ]:
# table dimensions: n = number of observations (rows),
# p = number of variables (columns)
n, p = X.shape
print(n)
print(p)
7836
18

ACM avec fanalysis

In [ ]:
# Optional: install the package on the fly (uncomment if fanalysis is missing).
# %pip is preferred over !pip because it targets the running kernel's environment.
#%pip install fanalysis
In [ ]:
# MCA (multiple correspondence analysis) class from the fanalysis package
from fanalysis.mca import MCA

# build the model, labelling rows with the DataFrame index and variables
# with the column names, then fit it on the raw array of category values
acm = MCA(row_labels=X.index, var_labels=X.columns)
acm.fit(X.to_numpy())

# eigenvalue table, shown as the cell output
acm.eig_
Out[ ]:
array([[1.94203920e-01, 8.17135274e-02, 7.14523138e-02, 6.35737241e-02,
        5.88448160e-02, 5.58523058e-02, 5.55688411e-02, 5.32930394e-02,
        5.27382461e-02, 4.91374174e-02, 4.67390321e-02, 4.50296758e-02,
        4.38053722e-02, 4.35085505e-02, 4.09629230e-02, 3.83005299e-02,
        3.72658100e-02, 3.64229806e-02, 3.53180258e-02, 3.23581436e-02,
        3.05774725e-02],
       [1.66460503e+01, 7.00401663e+00, 6.12448404e+00, 5.44917635e+00,
        5.04384137e+00, 4.78734050e+00, 4.76304352e+00, 4.56797480e+00,
        4.52042110e+00, 4.21177863e+00, 4.00620275e+00, 3.85968650e+00,
        3.75474619e+00, 3.72930433e+00, 3.51110768e+00, 3.28290257e+00,
        3.19421229e+00, 3.12196977e+00, 3.02725935e+00, 2.77355517e+00,
        2.62092621e+00],
       [1.66460503e+01, 2.36500669e+01, 2.97745509e+01, 3.52237273e+01,
        4.02675686e+01, 4.50549091e+01, 4.98179527e+01, 5.43859275e+01,
        5.89063486e+01, 6.31181272e+01, 6.71243299e+01, 7.09840164e+01,
        7.47387626e+01, 7.84680670e+01, 8.19791746e+01, 8.52620772e+01,
        8.84562895e+01, 9.15782593e+01, 9.46055186e+01, 9.73790738e+01,
        1.00000000e+02]])

Informations sur les points-modalités

In [ ]:
# Export the category-point results of the fitted MCA as a DataFrame
# (coordinates, contributions and cos2 for each dimension) and list its columns.
info_col = acm.col_topandas()
info_col.columns
Out[ ]:
Index(['col_coord_dim1', 'col_coord_dim2', 'col_coord_dim3', 'col_coord_dim4',
       'col_coord_dim5', 'col_coord_dim6', 'col_coord_dim7', 'col_coord_dim8',
       'col_coord_dim9', 'col_coord_dim10', 'col_coord_dim11',
       'col_coord_dim12', 'col_coord_dim13', 'col_coord_dim14',
       'col_coord_dim15', 'col_coord_dim16', 'col_coord_dim17',
       'col_coord_dim18', 'col_coord_dim19', 'col_coord_dim20',
       'col_coord_dim21', 'col_contrib_dim1', 'col_contrib_dim2',
       'col_contrib_dim3', 'col_contrib_dim4', 'col_contrib_dim5',
       'col_contrib_dim6', 'col_contrib_dim7', 'col_contrib_dim8',
       'col_contrib_dim9', 'col_contrib_dim10', 'col_contrib_dim11',
       'col_contrib_dim12', 'col_contrib_dim13', 'col_contrib_dim14',
       'col_contrib_dim15', 'col_contrib_dim16', 'col_contrib_dim17',
       'col_contrib_dim18', 'col_contrib_dim19', 'col_contrib_dim20',
       'col_contrib_dim21', 'col_cos2_dim1', 'col_cos2_dim2', 'col_cos2_dim3',
       'col_cos2_dim4', 'col_cos2_dim5', 'col_cos2_dim6', 'col_cos2_dim7',
       'col_cos2_dim8', 'col_cos2_dim9', 'col_cos2_dim10', 'col_cos2_dim11',
       'col_cos2_dim12', 'col_cos2_dim13', 'col_cos2_dim14', 'col_cos2_dim15',
       'col_cos2_dim16', 'col_cos2_dim17', 'col_cos2_dim18', 'col_cos2_dim19',
       'col_cos2_dim20', 'col_cos2_dim21'],
      dtype='object')
In [ ]:
# keep only the coordinates on the first two axes (first factorial plane)
first_plane = ['col_coord_dim1', 'col_coord_dim2']
coord_col = info_col.loc[:, first_plane]
print(coord_col)
                   col_coord_dim1  col_coord_dim2
Reading_n                0.733412       -0.041586
Reading_y               -0.362301        0.020543
Listening_music_n        0.839848        0.254103
Listening_music_y       -0.318176       -0.096267
Cinema_n                 0.527020        0.307260
Cinema_y                -0.718375       -0.418822
Show_n                   0.405560        0.112424
Show_y                  -0.943322       -0.261496
Exhibition_n             0.431890       -0.007079
Exhibition_y            -0.929998        0.015243
Computer_n               0.455623        0.200060
Computer_y              -0.681404       -0.299199
Sport_n                  0.415821        0.181959
Sport_y                 -0.653901       -0.286140
Walking_n                0.412714       -0.338255
Walking_y               -0.397413        0.325715
Travelling_n             0.496057        0.007750
Travelling_y            -0.711117       -0.011110
Playing_music_n          0.214572        0.029402
Playing_music_y         -0.972008       -0.133189
Collecting_n             0.067469       -0.057665
Collecting_y            -0.559681        0.478349
Volunteering_n           0.139880       -0.045265
Volunteering_y          -0.751258        0.243108
Mechanic_n               0.307265       -0.345452
Mechanic_y              -0.394287        0.443290
Gardening_n              0.177713       -0.553866
Gardening_y             -0.261996        0.816546
Knitting_n               0.050650       -0.179588
Knitting_y              -0.252092        0.893832
Cooking_n                0.311962       -0.342517
Cooking_y               -0.378000        0.415023
Fishing_n               -0.003218       -0.106732
Fishing_y                0.024103        0.799389
TV_n0                    0.473323       -0.317982
TV_n1                   -0.271869       -0.137644
TV_n2                   -0.175706        0.109968
TV_n3                   -0.028438        0.218662
TV_n4                    0.132535       -0.060858
In [ ]:
# map of the category points on the first factorial plane,
# to give a better picture of how the points are spread out
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(7,7))

# same window on both axes
lim = 1.2
ax.axis([-lim, lim, -lim, lim])

# dashed reference lines through the origin
ax.plot([-lim, lim], [0, 0], color='silver', linestyle='--')
ax.plot([0, 0], [-lim, lim], color='silver', linestyle='--')

ax.set_xlabel("Dim.1")
ax.set_ylabel("Dim.2")
ax.set_title("Carte des modalités")

# write each category label at its (dim 1, dim 2) coordinates
for lbl, (x, y) in zip(coord_col.index, coord_col.to_numpy()):
    ax.text(x, y, lbl, ha='center', va='center', fontsize=7)

plt.show()

Poids des modalités

In [ ]:
# Category frequencies (counts) stored on the fitted MCA object,
# one value per category point.
acm.c_
Out[ ]:
array([[2591., 5245., 2153., 5683., 4520., 3316., 5480., 2356., 5351.,
        2485., 4696., 3140., 4790., 3046., 3844., 3992., 4616., 3220.,
        6419., 1417., 6993.,  843., 6606., 1230., 4404., 3432., 4669.,
        3167., 6525., 1311., 4293., 3543., 6913.,  923.,  951., 1157.,
        2040., 1652., 2036.]])
In [ ]:
# sanity check on the first variable: its category counts should match
# the first entries of acm.c_
import numpy
numpy.unique(X["Reading"].to_numpy(), return_counts=True)
Out[ ]:
(array(['n', 'y'], dtype=object), array([2591, 5245], dtype=int64))
In [ ]:
# same check on the last variable (TV)
numpy.unique(X["TV"].to_numpy(), return_counts=True)
Out[ ]:
(array(['n0', 'n1', 'n2', 'n3', 'n4'], dtype=object),
 array([ 951, 1157, 2040, 1652, 2036], dtype=int64))
In [ ]:
# mass = relative weight of each category: its count divided by n*p
# (reference: book "Analyse Factorielle", p. 297);
# this vector is also the mean profile (same book, p. 290)
total_cells = n * p
col_mass = acm.c_ / total_cells
print(col_mass)
[[0.01836963 0.03718592 0.01526431 0.04029125 0.03204583 0.02350973
  0.03885202 0.01670353 0.03793744 0.01761812 0.03329363 0.02226193
  0.03396007 0.02159549 0.02725313 0.02830242 0.03272645 0.02282911
  0.04550933 0.01004623 0.04957887 0.00597669 0.04683512 0.00872044
  0.03122341 0.02433214 0.03310221 0.02245335 0.04626085 0.00929471
  0.03043645 0.02511911 0.04901168 0.00654387 0.00674239 0.00820288
  0.01446316 0.01171232 0.0144348 ]]

Clustering à partir de la représentation factorielle

In [ ]:
# silence every warning from this point on
# NOTE(review): this is a blanket, notebook-wide filter; consider silencing
# only the specific warning raised by the K-Means fit
import warnings
warnings.filterwarnings('ignore')

# scikit-learn K-Means, with 3 clusters and a fixed seed
from sklearn.cluster import KMeans
km = KMeans(random_state=0, n_clusters=3)

# fit on the category coordinates; each point is weighted by its mass
# through sample_weight (col_mass is a single-row 2-D array, hence ravel)
km.fit(coord_col, sample_weight=col_mass.ravel())

# cluster centres in the (dim 1, dim 2) plane
moyennes = km.cluster_centers_
print(moyennes)
[[ 0.29545093 -0.06231481]
 [-0.60880095 -0.13321424]
 [-0.30111167  0.46770847]]
In [ ]:
# Cluster label assigned to each category point, in the row order of coord_col.
print(km.labels_)
[0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 2 0 1 0 1 0 2 0 1 0 2 0 2 0 2 0 2 0 2 0 1 2
 2 0]
In [ ]:
# new frame: the coordinates plus a "groupes" column holding the cluster labels
# (coord_col itself is left unchanged)
coord_grp = coord_col.assign(groupes=km.labels_)

# quick visual check
coord_grp.head()
Out[ ]:
col_coord_dim1 col_coord_dim2 groupes
Reading_n 0.733412 -0.041586 0
Reading_y -0.362301 0.020543 1
Listening_music_n 0.839848 0.254103 0
Listening_music_y -0.318176 -0.096267 1
Cinema_n 0.527020 0.307260 0
In [ ]:
# group the category points by cluster label
gb = coord_grp.groupby('groupes')

# for each cluster, list the names of the categories it contains
# (gb.indices maps each label to the row positions of its members)
for k, rows in gb.indices.items():
    print("\nGroupe : ",k)
    print(coord_grp.index[rows])
Groupe :  0
Index(['Reading_n', 'Listening_music_n', 'Cinema_n', 'Show_n', 'Exhibition_n',
       'Computer_n', 'Sport_n', 'Walking_n', 'Travelling_n', 'Playing_music_n',
       'Collecting_n', 'Volunteering_n', 'Mechanic_n', 'Gardening_n',
       'Knitting_n', 'Cooking_n', 'Fishing_n', 'TV_n0', 'TV_n4'],
      dtype='object')

Groupe :  1
Index(['Reading_y', 'Listening_music_y', 'Cinema_y', 'Show_y', 'Exhibition_y',
       'Computer_y', 'Sport_y', 'Travelling_y', 'Playing_music_y',
       'Volunteering_y', 'TV_n1'],
      dtype='object')

Groupe :  2
Index(['Walking_y', 'Collecting_y', 'Mechanic_y', 'Gardening_y', 'Knitting_y',
       'Cooking_y', 'Fishing_y', 'TV_n2', 'TV_n3'],
      dtype='object')
In [ ]:
# map of the category points, coloured by K-Means cluster
import seaborn
fig, ax = plt.subplots(figsize=(7,7))

# same window on both axes, with dashed reference lines through the origin
lim = 1.2
ax.axis([-lim, lim, -lim, lim])
ax.plot([-lim, lim], [0, 0], color='silver', linestyle='--')
ax.plot([0, 0], [-lim, lim], color='silver', linestyle='--')

ax.set_xlabel("Dim.1")
ax.set_ylabel("Dim.2")
ax.set_title("Carte des modalités - Groupes K-Means")

# one colour per cluster, indexed by cluster label
couleurs = ['orangered','seagreen','dodgerblue']

for k, rows in gb.indices.items():
    teinte = couleurs[k]
    # star marker at the cluster centre
    cx, cy = moyennes[k]
    ax.plot(cx, cy, marker="*", markersize=8, color=teinte)
    # label of every category point belonging to this cluster
    for i in rows:
        ax.text(coord_col.iloc[i,0], coord_col.iloc[i,1], coord_col.index[i],
                color=teinte, ha='center', va='center', fontsize=7)

plt.show()