# Change the working directory to the demo folder.
# NOTE(review): hard-coded Windows path — adjust for your machine.
import os
os.chdir("C:/Users/ricco/Desktop/demo")
# Load the training data: 335 voters x 6 categorical vote variables,
# each answered 'y', 'n' or 'neither' (see info()/value_counts below).
import pandas
vote = pandas.read_excel("vote_clustering_train.xlsx")
# Quick look at the first rows.
vote.head()
budget | physician | salvador | nicaraguan | missile | education | |
---|---|---|---|---|---|---|
0 | n | y | y | n | n | y |
1 | y | n | n | y | y | n |
2 | y | n | n | y | y | n |
3 | y | n | n | y | y | n |
4 | y | y | y | y | n | n |
# Structure overview: 335 rows, 6 object (categorical) columns, no missing values.
vote.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 335 entries, 0 to 334 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 budget 335 non-null object 1 physician 335 non-null object 2 salvador 335 non-null object 3 nicaraguan 335 non-null object 4 missile 335 non-null object 5 education 335 non-null object dtypes: object(6) memory usage: 15.8+ KB
# Descriptive statistics for categorical columns (count / unique / top / freq).
vote.describe()
budget | physician | salvador | nicaraguan | missile | education | |
---|---|---|---|---|---|---|
count | 335 | 335 | 335 | 335 | 335 | 335 |
unique | 3 | 3 | 3 | 3 | 3 | 3 |
top | y | n | n | y | y | n |
freq | 194 | 192 | 165 | 191 | 165 | 180 |
# Frequency of each category ('y' / 'n' / 'neither') for every variable.
vote.apply(lambda col: col.value_counts(), axis=0)
budget | physician | salvador | nicaraguan | missile | education | |
---|---|---|---|---|---|---|
n | 133 | 192 | 165 | 131 | 154 | 180 |
neither | 8 | 10 | 9 | 13 | 16 | 28 |
y | 194 | 133 | 161 | 191 | 165 | 127 |
# One-off install of fanalysis if needed (uncomment on first run only).
#!pip install fanalysis
# Multiple Correspondence Analysis (MCA) on the 6 categorical variables.
from fanalysis.mca import MCA
acm = MCA(var_labels=vote.columns)
# fit() takes the raw category values as a numpy array.
acm.fit(vote.values)
# Eigenvalues: one column per dimension, rows are
# [eigenvalue, % of variance, cumulative %] (cumulative reaches 100).
print(acm.eig_)
[[7.43441284e-01 3.79687913e-01 1.74117626e-01 1.34147542e-01 1.24902832e-01 1.20304898e-01 8.62967703e-02 7.47046087e-02 5.97803489e-02 4.28345402e-02 3.57503159e-02 2.40313202e-02] [3.71720642e+01 1.89843957e+01 8.70588128e+00 6.70737712e+00 6.24514161e+00 6.01524491e+00 4.31483852e+00 3.73523044e+00 2.98901744e+00 2.14172701e+00 1.78751580e+00 1.20156601e+00] [3.71720642e+01 5.61564599e+01 6.48623412e+01 7.15697183e+01 7.78148599e+01 8.38301048e+01 8.81449433e+01 9.18801737e+01 9.48691912e+01 9.70109182e+01 9.87984340e+01 1.00000000e+02]]
# Scree plot of the eigenvalues.
acm.plot_eigenvalues()
# Map of the categories on the first factorial plane (axes 1 and 2).
acm.mapping_col(num_x_axis=1,num_y_axis=2,short_labels=False)
# Retrieve the individuals' coordinates on the first two factorial axes.
coord = acm.row_topandas().iloc[:,:2]
# Sanity check: factorial coordinates of the first 5 individuals.
coord.head(5)
row_coord_dim1 | row_coord_dim2 | |
---|---|---|
row0 | -1.163452 | 0.183648 |
row1 | 0.893901 | 0.206759 |
row2 | 0.893901 | 0.206759 |
row3 | 0.893901 | 0.206759 |
row4 | -0.141099 | 0.174007 |
# Scatter plot of the individuals in the factorial plane.
import seaborn as sns
sns.scatterplot(coord,x='row_coord_dim1',y='row_coord_dim2')
<Axes: xlabel='row_coord_dim1', ylabel='row_coord_dim2'>
# Add jittering to tell apart overlapping points
# (many individuals share identical vote profiles, hence identical coordinates).
import seaborn.objects as so
(
so.Plot(data=coord,x='row_coord_dim1',y='row_coord_dim2')
.add(so.Dots(),so.Jitter(x=0.2,y=0.2))
)
# One-off install of yellowbrick if needed (uncomment on first run only).
#!pip install yellowbrick
# Silence warnings globally.
# NOTE(review): blanket suppression also hides useful warnings — acceptable in a demo.
import warnings
warnings.filterwarnings(action='ignore')
# K-Means from scikit-learn.
from sklearn.cluster import KMeans
# Use yellowbrick's elbow method to pick a "good" number of clusters,
# testing k = 1..9 with the distortion (within-cluster SSE) metric.
from yellowbrick.cluster import KElbowVisualizer
coude = KElbowVisualizer(KMeans(),k=(1,10),metric='distortion',timings=False)
coude.fit(coord)
KElbowVisualizer(ax=<Axes: >, estimator=KMeans(n_clusters=9), k=(1, 10), timings=False)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KElbowVisualizer(ax=<Axes: >, estimator=KMeans(n_clusters=9), k=(1, 10), timings=False)
KMeans(n_clusters=9)
KMeans(n_clusters=9)
# Partition the individuals into 3 clusters (choice informed by the elbow plot),
# clustering on the 2 factorial coordinates; random_state fixed for reproducibility.
km = KMeans(n_clusters=3,random_state=0)
km.fit(coord)
# Cluster label assigned to each individual.
print(km.labels_)
[0 1 1 1 0 1 1 0 0 1 1 1 0 1 0 1 0 1 1 0 0 1 0 0 1 1 0 1 0 1 0 0 0 1 1 0 1 2 0 1 1 1 1 1 1 0 1 1 0 0 0 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 1 0 0 1 0 1 0 0 1 1 1 0 1 1 0 0 1 0 1 1 0 1 0 1 0 0 1 0 0 1 1 0 1 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 0 1 0 0 1 0 2 1 0 1 0 1 0 1 0 1 1 1 0 1 0 1 0 1 1 1 1 1 0 0 1 0 0 1 0 1 1 0 1 1 1 1 1 0 1 0 0 0 1 0 1 1 0 0 0 0 1 2 1 0 1 0 0 1 0 1 1 0 0 1 1 0 1 1 2 0 0 0 0 0 1 0 0 0 2 1 0 1 0 1 0 0 1 0 0 1 0 0 0 1 1 0 1 0 0 1 0 0 1 1 0 1 0 1 1 1 1 1 1]
# Cluster sizes (counts per label).
import numpy
print(numpy.unique(km.labels_,return_counts=True))
(array([0, 1, 2]), array([147, 183, 5], dtype=int64))
# Labelled copy of the coordinates, for the coloured plot and the crosstabs.
lbl_coord = coord.copy()
# Use plain strings for the group labels: astype('S') yields numpy *bytes*
# strings, which show up as b'0', b'1', ... in legends and crosstab indexes.
lbl_coord['groupes'] = km.labels_.astype(str)
# Jittered scatter plot of the individuals, coloured by cluster.
(
so.Plot(
data=lbl_coord,x='row_coord_dim1',y='row_coord_dim2',color='groupes'
)
.add(so.Dots(),so.Jitter(x=0.2,y=0.2))
)
# Characterise the clusters: cross-tabulate each vote variable with the
# cluster labels, normalised by column (category profile within each cluster).
for v in vote.columns:
    # f-string instead of passing '\n' as a positional .format() argument;
    # the printed output is byte-identical.
    print(f"\n<< {v.upper()} >>")
    print(pandas.crosstab(index=vote[v],columns=km.labels_,normalize='columns'))
<< BUDGET >> col_0 0 1 2 budget n 0.816327 0.071038 0.0 neither 0.020408 0.000000 1.0 y 0.163265 0.928962 0.0 << PHYSICIAN >> col_0 0 1 2 physician n 0.136054 0.939891 0.0 neither 0.006803 0.021858 1.0 y 0.857143 0.038251 0.0 << SALVADOR >> col_0 0 1 2 salvador n 0.006803 0.885246 0.4 neither 0.006803 0.027322 0.6 y 0.986395 0.087432 0.0 << NICARAGUAN >> col_0 0 1 2 nicaraguan n 0.857143 0.027322 0.0 neither 0.054422 0.010929 0.6 y 0.088435 0.961749 0.4 << MISSILE >> col_0 0 1 2 missile n 0.911565 0.109290 0.0 neither 0.013605 0.065574 0.4 y 0.074830 0.825137 0.6 << EDUCATION >> col_0 0 1 2 education n 0.156463 0.852459 0.2 neither 0.088435 0.065574 0.6 y 0.755102 0.081967 0.2
# Build a single pipeline chaining the two steps:
# MCA (kept to 2 components) then K-Means (3 clusters, same random_state).
from sklearn.pipeline import Pipeline
wkf = Pipeline([('acm',MCA(n_components=2,var_labels=vote.columns)),
                ('km',KMeans(n_clusters=3,random_state=0))])
# Fit the whole chain and assign cluster labels in one call.
clusters = wkf.fit_predict(vote.values)
# Cluster sizes — expected to match the step-by-step approach above.
print(numpy.unique(clusters,return_counts=True))
(array([0, 1, 2]), array([147, 183, 5], dtype=int64))
# Cross-check pipeline clusters against the step-by-step clusters
# (a purely diagonal table means the two partitions are identical).
pandas.crosstab(lbl_coord['groupes'],clusters)
col_0 | 0 | 1 | 2 |
---|---|---|---|
groupes | |||
b'0' | 147 | 0 | 0 |
b'1' | 0 | 183 | 0 |
b'2' | 0 | 0 | 5 |
# Record the scikit-learn version: pickled models should be reloaded
# with the same version.
import sklearn
sklearn.__version__
'1.2.2'
# Serialize the fitted pipeline for later reuse (e.g. scoring new data).
import pickle
# 'with' guarantees the file is closed even if pickle.dump() raises,
# unlike the manual open()/close() pair.
with open("workflow.sav","wb") as f:
    pickle.dump(wkf,f)