In [ ]:
# Imports and data loading.
# NOTE(review): hardcoded absolute Windows path — this only works on the
# author's machine; consider a configurable data directory (pathlib.Path).
import os
os.chdir("C:/Users/ricco/Desktop/demo")

import pandas
# Iris measurements; the true species label is in the last column (used
# later as the external "ground truth" for the clustering measures).
iris = pandas.read_excel("iris_for_measures_clustering.xlsx")
iris.head()
Out[ ]:
SepalLength SepalWidth PetalLength PetalWidth Species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
In [ ]:
# Distribution of the true classes (balanced: 50 of each species)
iris["Species"].value_counts()
Out[ ]:
setosa        50
versicolor    50
virginica     50
Name: Species, dtype: int64
In [ ]:
# Active variables: every column except the last one (the Species label)
X = iris.iloc[:, :-1]
X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SepalLength  150 non-null    float64
 1   SepalWidth   150 non-null    float64
 2   PetalLength  150 non-null    float64
 3   PetalWidth   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB
In [ ]:
# Scatterplot matrix over all pairs of the four measurements
import seaborn as sns
pair_grid = sns.pairplot(X, height=2, markers=".")
pair_grid
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x2430233ddf0>

K-Means avec 3 et 4 clusters¶

In [ ]:
# K-means with 3 clusters (fixed seed so the partition is reproducible)
from sklearn.cluster import KMeans
import numpy

k3 = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)

# cluster sizes
numpy.unique(k3.labels_, return_counts=True)
Out[ ]:
(array([0, 1, 2]), array([62, 50, 38], dtype=int64))
In [ ]:
# Cluster positions across variable pairs (3-cluster solution; hues 0, 1, 2)
plot_df = X.copy()
plot_df["clusters"] = k3.labels_
sns.pairplot(plot_df, hue="clusters", palette=["cornflowerblue", "orange", "green"], markers=".", height=2)
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x24326d78e80>
In [ ]:
# Cross-tabulation of the true species against the 3-cluster labels
tab3 = pandas.crosstab(iris["Species"], k3.labels_)
tab3
Out[ ]:
col_0 0 1 2
Species
setosa 0 50 0
versicolor 48 0 2
virginica 14 0 36
In [ ]:
# K-means with 4 clusters, same seed as the 3-cluster model for comparability.
# Fix: dropped the redundant `import numpy` — numpy is already imported in
# the 3-cluster cell above; duplicate scattered imports hide dependencies.
k4 = KMeans(n_clusters=4, n_init=10, random_state=0)
k4.fit(X)

# cluster sizes
numpy.unique(k4.labels_, return_counts=True)
Out[ ]:
(array([0, 1, 2, 3]), array([28, 50, 32, 40], dtype=int64))
In [ ]:
# Cluster positions across variable pairs (4-cluster solution; hues 0, 1, 2, 3)
plot_df = X.copy()
plot_df["clusters"] = k4.labels_
sns.pairplot(plot_df, hue="clusters", palette=["cornflowerblue", "orange", "green", "black"], markers=".", height=2)
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x2432bdf0af0>
In [ ]:
# Cross-tabulation of the true species against the 4-cluster labels
tab4 = pandas.crosstab(iris["Species"], k4.labels_)
tab4
Out[ ]:
col_0 0 1 2 3
Species
setosa 0 50 0 0
versicolor 27 0 0 23
virginica 1 0 32 17

Mesures externes - Classes réelles vs. clusters¶

In [ ]:
# Cramér's V (scipy) on each contingency table — higher means stronger
# association between true classes and clusters
from scipy.stats.contingency import association
for contingency in (tab3, tab4):
    print(association(contingency, method="cramer"))
0.8633255872551836
0.8861231775065392
In [ ]:
# metrics module from scikit-learn (reused by the cells below)
from sklearn import metrics

# mutual information between true species and cluster labels
for km in (k3, k4):
    print(metrics.mutual_info_score(iris.Species, km.labels_))
0.8255910976103357
0.8880235203213085
In [ ]:
# Normalized mutual information (scaled to [0, 1])
for km in (k3, k4):
    print(metrics.normalized_mutual_info_score(iris.Species, km.labels_))
0.7581756800057785
0.7219203867820961
In [ ]:
# Adjusted mutual information (corrected for chance)
for km in (k3, k4):
    print(metrics.adjusted_mutual_info_score(iris.Species, km.labels_))
0.7551191675800484
0.7172081944051023
In [ ]:
# Rand index (fraction of agreeing pairs)
for km in (k3, k4):
    print(metrics.rand_score(iris.Species, km.labels_))
0.8797315436241611
0.8539597315436241
In [ ]:
# Adjusted Rand index (corrected for chance)
for km in (k3, k4):
    print(metrics.adjusted_rand_score(iris.Species, km.labels_))
0.7302382722834697
0.6498176853819967
In [ ]:
# Homogeneity: each cluster contains only members of a single class
for km in (k3, k4):
    print(metrics.homogeneity_score(iris.Species, km.labels_))
0.7514854021988339
0.8083138423637095
In [ ]:
# Completeness: all members of a class fall into the same cluster
for km in (k3, k4):
    print(metrics.completeness_score(iris.Species, km.labels_))
0.7649861514489816
0.6522113355514313
In [ ]:
# V-measure: harmonic mean of homogeneity and completeness
for km in (k3, k4):
    print(metrics.v_measure_score(iris.Species, km.labels_))
0.7581756800057786
0.721920386782096

Mesures internes¶

In [ ]:
# Within-cluster sums of squares (inertia) — lower is better
for km in (k3, k4):
    print(km.inertia_)
78.85144142614601
57.22847321428571
In [ ]:
# Total inertia (total sum of squares around the global mean)
centered = X - numpy.mean(X, axis=0)          # center each variable
sq_dist = numpy.sum(centered ** 2, axis=1)    # squared distance of each row to the mean
TSS = numpy.sum(sq_dist)                      # sum over all rows
print(TSS)
681.3706
In [ ]:
# Proportion of inertia explained by each partition (BSS/TSS) — higher is better
for wss in (k3.inertia_, k4.inertia_):
    print((TSS - wss) / TSS)
0.8842752513446485
0.9160097702861179
In [ ]:
# Calinski-Harabasz index — higher is better
for km in (k3, k4):
    print(metrics.calinski_harabasz_score(X, km.labels_))
561.62775662962
530.7658081872851
In [ ]:
# Global silhouette score — higher is better
for km in (k3, k4):
    print(metrics.silhouette_score(X, km.labels_))
0.5528190123564102
0.49805050499728815
In [ ]:
# Silhouette plot for the 3-cluster model (one color per cluster: 0, 1, 2)
from yellowbrick.cluster import SilhouetteVisualizer

silhouette_viz = SilhouetteVisualizer(k3, colors=["cornflowerblue", "orange", "green"])
silhouette_viz.fit(X)
c:\Users\ricco\anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but KMeans was fitted with feature names
  warnings.warn(
Out[ ]:
SilhouetteVisualizer(ax=<AxesSubplot: >,
                     colors=['cornflowerblue', 'orange', 'green'],
                     estimator=KMeans(n_clusters=3, random_state=0))
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SilhouetteVisualizer(ax=<AxesSubplot: >,
                     colors=['cornflowerblue', 'orange', 'green'],
                     estimator=KMeans(n_clusters=3, random_state=0))
KMeans(n_clusters=3, random_state=0)
KMeans(n_clusters=3, random_state=0)
In [ ]:
# Davies-Bouldin index — lower is better
for km in (k3, k4):
    print(metrics.davies_bouldin_score(X, km.labels_))
0.6619715465007465
0.7803069838811107