#imports
import os
os.chdir("C:/Users/ricco/Desktop/demo")
import pandas
iris = pandas.read_excel("iris_for_measures_clustering.xlsx")
iris.head()
| | SepalLength | SepalWidth | PetalLength | PetalWidth | Species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
#distribution of the "true" classes
iris.Species.value_counts()
setosa        50
versicolor    50
virginica     50
Name: Species, dtype: int64
#X - active variables
X = iris[iris.columns[:-1]]
X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   SepalLength  150 non-null    float64
 1   SepalWidth   150 non-null    float64
 2   PetalLength  150 non-null    float64
 3   PetalWidth   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB
#pairwise plots of the variables
import seaborn as sns
sns.pairplot(X,markers=".",height=2)
<seaborn.axisgrid.PairGrid at 0x2430233ddf0>
#k-means with 3 clusters
from sklearn.cluster import KMeans
k3 = KMeans(n_clusters=3,n_init=10,random_state=0)
k3.fit(X)
#cluster sizes
import numpy
numpy.unique(k3.labels_,return_counts=True)
(array([0, 1, 2]), array([62, 50, 38], dtype=int64))
#cluster positions by pairs of variables
tempDf = X.copy()
tempDf['clusters'] = k3.labels_
sns.pairplot(tempDf,hue='clusters',palette=["cornflowerblue","orange","green"],markers=".",height=2) #0, 1, 2
<seaborn.axisgrid.PairGrid at 0x24326d78e80>
#cross-tabulation with the actual classes
tab3 = pandas.crosstab(iris.Species,k3.labels_)
tab3
| Species | cluster 0 | cluster 1 | cluster 2 |
|---|---|---|---|
| setosa | 0 | 50 | 0 |
| versicolor | 48 | 0 | 2 |
| virginica | 14 | 0 | 36 |
#k-means with 4 clusters
k4 = KMeans(n_clusters=4,n_init=10,random_state=0)
k4.fit(X)
#cluster sizes
import numpy
numpy.unique(k4.labels_,return_counts=True)
(array([0, 1, 2, 3]), array([28, 50, 32, 40], dtype=int64))
#cluster positions by pairs of variables
tempDf = X.copy()
tempDf['clusters'] = k4.labels_
sns.pairplot(tempDf,hue='clusters',palette=["cornflowerblue","orange","green","black"],markers=".",height=2) #0, 1, 2, 3
<seaborn.axisgrid.PairGrid at 0x2432bdf0af0>
#croisement avec les "vraies"classes
tab4 = pandas.crosstab(iris.Species,k4.labels_)
tab4
| Species | cluster 0 | cluster 1 | cluster 2 | cluster 3 |
|---|---|---|---|---|
| setosa | 0 | 50 | 0 | 0 |
| versicolor | 27 | 0 | 0 | 23 |
| virginica | 1 | 0 | 32 | 17 |
#Cramer's V from scipy
from scipy.stats.contingency import association
print(association(tab3,method="cramer"))
print(association(tab4,method="cramer"))
0.8633255872551836
0.8861231775065392
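As a cross-check, Cramer's V can also be derived from the chi-squared statistic of the contingency table, V = sqrt(chi2 / (n * (min(rows, cols) - 1))). A minimal sketch for tab3, assuming scipy.stats.chi2_contingency (no continuity correction is applied here since the table is larger than 2x2):
#sketch: Cramer's V recomputed from the chi-squared statistic of tab3
from scipy.stats import chi2_contingency
chi2, _, _, _ = chi2_contingency(tab3)
n = tab3.values.sum()                 #number of observations (150)
kmin = min(tab3.shape) - 1            #min(#rows, #columns) - 1
print(numpy.sqrt(chi2 / (n * kmin)))  #expected to match association(tab3, method="cramer")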
#metrics module from sklearn
from sklearn import metrics
#mutual information
print(metrics.mutual_info_score(iris.Species, k3.labels_))
print(metrics.mutual_info_score(iris.Species, k4.labels_))
0.8255910976103357
0.8880235203213085
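mutual_info_score appears to be the plug-in estimate computed from the contingency table with natural logarithms; a sketch re-deriving the k3 value from tab3:
#sketch: mutual information from the joint and marginal proportions of tab3
P = tab3.values / tab3.values.sum()    #joint proportions p_ij
prow = P.sum(axis=1, keepdims=True)    #marginals of the species
pcol = P.sum(axis=0, keepdims=True)    #marginals of the clusters
nz = P > 0                             #skip empty cells to avoid log(0)
print(numpy.sum(P[nz] * numpy.log(P[nz] / (prow @ pcol)[nz])))  #expected to match mutual_info_score for k3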
#normalized mutual information
print(metrics.normalized_mutual_info_score(iris.Species, k3.labels_))
print(metrics.normalized_mutual_info_score(iris.Species, k4.labels_))
0.7581756800057785
0.7219203867820961
#adjusted mutual information
print(metrics.adjusted_mutual_info_score(iris.Species, k3.labels_))
print(metrics.adjusted_mutual_info_score(iris.Species, k4.labels_))
0.7551191675800484
0.7172081944051023
#Rand index
print(metrics.rand_score(iris.Species, k3.labels_))
print(metrics.rand_score(iris.Species, k4.labels_))
0.8797315436241611
0.8539597315436241
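The Rand index can also be obtained by pair counting on the contingency table: out of the C(n,2) pairs of observations, it is the proportion of pairs that are either grouped together in both partitions or separated in both. A sketch for k3:
#sketch: Rand index by pair counting on tab3
from scipy.special import comb
N3 = tab3.values
npairs = comb(N3.sum(), 2)                        #total number of pairs C(n,2)
together_both = comb(N3, 2).sum()                 #pairs grouped together in both partitions
together_species = comb(N3.sum(axis=1), 2).sum()  #pairs grouped in the same species
together_cluster = comb(N3.sum(axis=0), 2).sum()  #pairs grouped in the same cluster
print((npairs + 2*together_both - together_species - together_cluster) / npairs)  #expected to match rand_score for k3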
#adjusted Rand score
print(metrics.adjusted_rand_score(iris.Species, k3.labels_))
print(metrics.adjusted_rand_score(iris.Species, k4.labels_))
0.7302382722834697
0.6498176853819967
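The adjusted version corrects the same pair counts for chance agreement, ARI = (Index - Expected) / (Max - Expected); a sketch for k3 (self-contained, so the counts are recomputed):
#sketch: adjusted Rand index from the pair counts of tab3
from scipy.special import comb
N3 = tab3.values
index = comb(N3, 2).sum()                    #pairs grouped together in both partitions
sum_species = comb(N3.sum(axis=1), 2).sum()  #pairs within the same species
sum_cluster = comb(N3.sum(axis=0), 2).sum()  #pairs within the same cluster
expected = sum_species * sum_cluster / comb(N3.sum(), 2)
max_index = (sum_species + sum_cluster) / 2
print((index - expected) / (max_index - expected))  #expected to match adjusted_rand_score for k3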
#homogeneity score
print(metrics.homogeneity_score(iris.Species, k3.labels_))
print(metrics.homogeneity_score(iris.Species, k4.labels_))
0.7514854021988339
0.8083138423637095
#completeness score
print(metrics.completeness_score(iris.Species, k3.labels_))
print(metrics.completeness_score(iris.Species, k4.labels_))
0.7649861514489816
0.6522113355514313
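Homogeneity and completeness are entropy ratios: h = 1 - H(species | cluster) / H(species) and c = 1 - H(cluster | species) / H(cluster). A sketch recomputing both for k3 from the contingency table:
#sketch: homogeneity and completeness from the (conditional) entropies of tab3
P3 = tab3.values / tab3.values.sum()   #joint proportions
ps = P3.sum(axis=1)                    #species marginals
pk = P3.sum(axis=0)                    #cluster marginals
Hs = -numpy.sum(ps * numpy.log(ps))    #entropy of the species
Hk = -numpy.sum(pk * numpy.log(pk))    #entropy of the clusters
nz = P3 > 0                            #skip empty cells
Hs_k = -numpy.sum(P3[nz] * numpy.log((P3 / pk)[nz]))           #H(species | cluster)
Hk_s = -numpy.sum(P3[nz] * numpy.log((P3 / ps[:, None])[nz]))  #H(cluster | species)
print(1 - Hs_k / Hs)   #expected to match homogeneity_score for k3
print(1 - Hk_s / Hk)   #expected to match completeness_score for k3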
#v-measure: harmonic mean of homogeneity and completeness
print(metrics.v_measure_score(iris.Species, k3.labels_))
print(metrics.v_measure_score(iris.Species, k4.labels_))
0.7581756800057786
0.721920386782096
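By construction, the v-measure is the harmonic mean of the two previous scores; a quick check for k3:
#sketch: v-measure as the harmonic mean of homogeneity and completeness
h3 = metrics.homogeneity_score(iris.Species, k3.labels_)
c3 = metrics.completeness_score(iris.Species, k3.labels_)
print(2 * h3 * c3 / (h3 + c3))   #expected to match v_measure_score for k3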
#within-cluster inertia - to be minimized
print(k3.inertia_)
print(k4.inertia_)
78.85144142614601
57.22847321428571
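inertia_ is the within-cluster sum of squares, i.e. the sum of squared distances of each observation to its assigned centroid; a sketch recomputing it for k3:
#sketch: within-cluster inertia recomputed from the fitted centroids
assigned = k3.cluster_centers_[k3.labels_]    #centroid attached to each observation
print(numpy.sum((X.values - assigned) ** 2))  #expected to match k3.inertia_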
#total inertia (TSS)
XC = X - numpy.mean(X,axis=0) #center the data on the mean
CARDIST = numpy.sum(XC**2,axis=1) #squared distances to the mean
TSS = numpy.sum(CARDIST) #sum of squares...
print(TSS)
681.3706
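Equivalently, TSS should coincide with the inertia of a degenerate one-cluster k-means, whose single centroid is the overall mean; a quick check:
#sketch: TSS as the inertia of a one-cluster k-means
k1 = KMeans(n_clusters=1, n_init=10, random_state=0).fit(X)
print(k1.inertia_)   #expected to equal TSS (up to numerical precision)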
#inertia explained by the partitions (BSS/TSS) - to be maximized
print((TSS-k3.inertia_)/TSS)
print((TSS-k4.inertia_)/TSS)
0.8842752513446485
0.9160097702861179
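The same ratio can be scanned over several values of k to see how quickly the explained inertia saturates; a sketch, assuming the same KMeans settings as above:
#sketch: explained inertia BSS/TSS for a range of cluster counts
for k in range(2, 7):
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    print(k, (TSS - km.inertia_) / TSS)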
#Calinski-Harabasz index - to be maximized
print(metrics.calinski_harabasz_score(X,k3.labels_))
print(metrics.calinski_harabasz_score(X,k4.labels_))
561.62775662962
530.7658081872851
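Calinski-Harabasz is, as far as I know, the between/within dispersion ratio scaled by the degrees of freedom, CH = (BSS / (k - 1)) / (WSS / (n - k)); a check for k3 using the quantities already computed:
#sketch: Calinski-Harabasz recomputed from the inertia decomposition
n_obs = X.shape[0]
BSS3 = TSS - k3.inertia_
print((BSS3 / (3 - 1)) / (k3.inertia_ / (n_obs - 3)))   #expected to match calinski_harabasz_score for k3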
#silhouette score (global) - to be maximized
print(metrics.silhouette_score(X,k3.labels_))
print(metrics.silhouette_score(X,k4.labels_))
0.5528190123564102
0.49805050499728815
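The global silhouette is the mean of the per-observation silhouette values; silhouette_samples exposes them, which also gives a per-cluster average. A sketch for k3:
#sketch: per-observation and per-cluster silhouette values
sil3 = metrics.silhouette_samples(X, k3.labels_)
print(sil3.mean())                                     #expected to match silhouette_score for k3
print(pandas.Series(sil3).groupby(k3.labels_).mean())  #average silhouette of each cluster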
#silhouette plot
from yellowbrick.cluster import SilhouetteVisualizer
visualizer = SilhouetteVisualizer(k3,colors=["cornflowerblue","orange","green"]) #0, 1, 2
visualizer.fit(X)
SilhouetteVisualizer(ax=<AxesSubplot: >, colors=['cornflowerblue', 'orange', 'green'], estimator=KMeans(n_clusters=3, random_state=0))
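With yellowbrick the figure is usually rendered explicitly after fitting; show() is part of the visualizer API, although how the plot is displayed depends on the environment:
#render the silhouette diagram (display depends on the notebook backend)
visualizer.show()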
#Davies-Bouldin index - to be minimized
print(metrics.davies_bouldin_score(X,k3.labels_))
print(metrics.davies_bouldin_score(X,k4.labels_))
0.6619715465007465
0.7803069838811107