# Cap the thread count up-front to silence spurious OpenMP/MKL warnings.
import os

os.environ['OMP_NUM_THREADS'] = '2'

# Work from the folder that contains the data file.
os.chdir("C:/Users/ricco/Desktop/demo")

# Load the dataset and display its structure.
import pandas

df = pandas.read_excel("artificial_2D_for_gapstat.xlsx")
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   V1      300 non-null    float64
 1   V2      300 non-null    float64
dtypes: float64(2)
memory usage: 4.8 KB
# Descriptive statistics (count, mean, std, quartiles) for each variable.
df.describe()
V1 | V2 | |
---|---|---|
count | 300.000000 | 300.000000 |
mean | 3.055933 | -0.591674 |
std | 2.256377 | 3.257477 |
min | -1.858000 | -7.782500 |
25% | 0.808725 | -4.175550 |
50% | 3.714650 | 0.894350 |
75% | 4.834075 | 2.033175 |
max | 8.030900 | 4.958600 |
# Scatter plot of the individuals in the (V1, V2) plane.
import seaborn as sns

sns.scatterplot(x='V1', y='V2', data=df)
<Axes: xlabel='V1', ylabel='V2'>
# Build a "null" version of the data: each column is shuffled
# independently, which destroys any joint structure between variables.
from sklearn.utils import shuffle

dfRnd = df.apply(func=lambda col: shuffle(col.values), axis=0)
# Column-wise shuffling leaves each marginal distribution unchanged,
# so these descriptive statistics match the original data exactly.
dfRnd.describe()
V1 | V2 | |
---|---|---|
count | 300.000000 | 300.000000 |
mean | 3.055933 | -0.591674 |
std | 2.256377 | 3.257477 |
min | -1.858000 | -7.782500 |
25% | 0.808725 | -4.175550 |
50% | 3.714650 | 0.894350 |
75% | 4.834075 | 2.033175 |
max | 8.030900 | 4.958600 |
# The scatter plot, however, shows that the relations between the
# individuals and the variables are now fully "randomized".
sns.scatterplot(x='V1', y='V2', data=dfRnd)
<Axes: xlabel='V1', ylabel='V2'>
# Largest number of clusters to evaluate.
max_K = 10

# Within-cluster sum of squares (inertia) for K = 2 .. max_K.
import numpy

WSS = numpy.zeros(max_K - 1)

# Fit a K-Means for every candidate number of clusters on the real data.
from sklearn.cluster import KMeans

for idx, n_clusters in enumerate(range(2, max_K + 1)):
    model = KMeans(n_clusters=n_clusters, n_init=10)
    model.fit(df)
    WSS[idx] = model.inertia_

# Check the inertia values.
print(WSS)
[1340.48252875 578.59135143 499.35870903 423.90862603 353.20702713 302.1721438 263.5462446 232.72297831 210.60205772]
# TSS - total sum of squares: n times the sum of the
# population variances (ddof=0) of the variables.
TSS = len(df) * df.var(axis=0, ddof=0).sum()
print(TSS)
4695.014959903868
# Decreasing WSS curve as a function of the number of clusters,
# prepending TSS as the K = 1 value.
import matplotlib.pyplot as plt

wss_curve = numpy.concatenate(([TSS], WSS))
plt.plot(numpy.arange(1, max_K + 1), wss_curve, marker='.')
[<matplotlib.lines.Line2D at 0x23202927f10>]
# Number of Monte-Carlo replications under the null hypothesis.
nboot = 25

# One row per replication, one column per K (K = 2 .. max_K).
mat_wss = numpy.zeros((nboot, max_K - 1))

# For each replication...
for b in range(nboot):
    # ...generate null data by shuffling each column independently...
    dfAlea = df.apply(func=lambda col: shuffle(col.values), axis=0)
    # ...and record the K-Means inertia for every candidate K.
    for idx, n_clusters in enumerate(range(2, max_K + 1)):
        model = KMeans(n_clusters=n_clusters, n_init=10)
        model.fit(dfAlea)
        mat_wss[b, idx] = model.inertia_

# Inspect the first 5 rows of the matrix.
mat_wss[:5, :]
array([[1865.8120818 , 1074.11821772, 636.45993057, 523.86989359, 459.52970763, 403.95648443, 350.32576286, 324.22248891, 290.78649835], [1862.2702088 , 1034.25157466, 636.89004845, 525.29568054, 458.30830128, 406.35187783, 355.18245507, 325.32915043, 293.39918589], [1857.61328136, 1019.05530707, 638.29323567, 510.55906886, 453.68857793, 396.46183807, 355.42046723, 324.87852175, 290.69164972], [1865.92251094, 1067.16623662, 638.07189975, 528.87896613, 456.37782091, 403.53388725, 370.14502889, 320.34274172, 284.07677314], [1858.26301189, 1120.61663455, 629.91273991, 532.28695013, 470.79477543, 410.8064369 , 355.32033934, 323.23946922, 295.80141358]])
# Work on the log scale, as the gap statistic requires.
log_mat_wss = numpy.log(mat_wss)

# Expected log-inertia under H0: average over the replications, per column.
moy_log_wss_H0 = log_mat_wss.mean(axis=0)
print(moy_log_wss_H0)
[7.52857754 6.95751687 6.45280306 6.26713803 6.1313525 6.00360288 5.87252522 5.77335096 5.675574 ]
# Gap statistic: expected log(WSS) under H0 minus observed log(WSS).
gap_stat = moy_log_wss_H0 - numpy.log(WSS)
print(gap_stat)
[0.32779262 0.59692042 0.23947837 0.21762011 0.26429814 0.29260601 0.29829636 0.32350215 0.32560363]
# Prepend the K = 1 case: shuffling preserves each column's variance,
# so log(WSS_H0) equals log(TSS) here and the gap is necessarily 0.
gap_stat = numpy.insert(gap_stat, 0, 0)
print(gap_stat)
[0. 0.32779262 0.59692042 0.23947837 0.21762011 0.26429814 0.29260601 0.29829636 0.32350215 0.32560363]
# Plot the gap statistic to identify the best solution.
# See the documentation of the R "cluster" package (clusGap):
# https://stat.ethz.ch/R-manual/R-devel/library/cluster/html/clusGap.html
plt.plot(numpy.arange(1, max_K + 1), gap_stat, marker='.')
[<matplotlib.lines.Line2D at 0x23202fa5ba0>]
# Dispersion of log(WSS) under H0 (population standard deviation).
gap_std = log_mat_wss.std(axis=0, ddof=0)
print(gap_std)
[0.00293443 0.04325337 0.00510569 0.01434215 0.01973374 0.02152195 0.02148035 0.01957352 0.02198822]
# For K = 1 the gap is fixed at 0, with no dispersion.
gap_std = numpy.insert(gap_std, 0, 0)
print(gap_std)
[0. 0.00293443 0.04325337 0.00510569 0.01434215 0.01973374 0.02152195 0.02148035 0.01957352 0.02198822]
# Gap statistic curve with the standard deviations materialized.
fig, axe = plt.subplots(figsize=(5, 5))
ks = numpy.arange(1, max_K + 1)
axe.plot(ks, gap_stat, linewidth=1, marker='.')
# Vertical segment of +/- one standard deviation around each gap value.
for x, g, s in zip(ks, gap_stat, gap_std):
    axe.plot([x, x], [g - s, g + s],
             color='gray', marker='_', markersize=10, linewidth=1)
plt.show()
# Final partition: K-Means with the K = 3 solution suggested by the gap.
km3 = KMeans(n_clusters=3, n_init=10, random_state=0)
km3.fit(df)

# Visualize the clusters in the (V1, V2) plane.
sns.scatterplot(x='V1', y='V2', data=df, hue=km3.labels_,
                palette=['coral', 'limegreen', 'cornflowerblue'])
<Axes: xlabel='V1', ylabel='V2'>