Chargement et inspection des données

In [ ]:
# Move into the folder that holds the demo files
import os
os.chdir("C:/Users/ricco/Desktop/demo")

# Read the Excel workbook into a DataFrame and preview its first five rows
import pandas
D = pandas.read_excel(io="artificial_2D.xlsx")
D.head(n=5)
Out[ ]:
X1 X2
0 4.524627 -4.522390
1 3.711003 0.608184
2 6.558806 -4.890597
3 2.528326 1.751365
4 0.186561 2.410052
In [ ]:
# Plain scatter plot of the two variables, before any clustering
import seaborn as sns
sns.scatterplot(data=D, x="X1", y="X2", color="silver")
Out[ ]:
<Axes: xlabel='X1', ylabel='X2'>

Clustering "crisp"

In [ ]:
# K-Means estimator from scikit-learn
from sklearn.cluster import KMeans

# 3 clusters, a single initialisation and a fixed seed; fit() returns the
# estimator itself, so construction and training are chained
kmc = KMeans(n_clusters=3, n_init=1, random_state=0).fit(D)

# Cluster label assigned to each observation
kmc.labels_
c:\Users\ricco\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.
  warnings.warn(
Out[ ]:
array([0, 2, 0, 2, 1, 1, 0, 1, 2, 2, 2, 2, 0, 0, 2, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 2, 0, 2, 0, 2, 1, 2, 1, 1, 2, 1, 1,
       0, 0, 1, 2, 0, 0, 2, 0, 0, 0, 1, 0, 1, 2, 1, 0, 0, 2, 2, 1, 1, 2,
       2, 1, 2, 0, 1, 1, 1, 0, 1, 0, 2, 1, 1, 0, 0, 0, 1, 0, 1, 2, 0, 0,
       1, 0, 2, 1, 0, 2, 0, 2, 0, 0, 2, 2, 0, 2, 1, 2, 2, 1, 1, 0, 2, 0,
       2, 0, 2, 1, 0, 1, 2, 0, 0, 1, 0, 1, 2, 0, 2, 1, 1, 0, 2, 0, 2, 0,
       1, 2, 2, 0, 2, 0, 0, 0, 2, 1, 1, 2, 0, 2, 1, 0, 2, 1, 2, 0, 2, 0,
       1, 2, 2, 0, 2, 0, 0, 1, 1, 2, 1, 0, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       2, 0, 2, 2, 1, 0, 1, 0, 2, 1, 1, 2, 0, 2, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 2, 2, 0, 2, 2, 1, 2, 2, 2, 0, 1, 0, 2, 1, 2, 1, 2, 0, 1, 1,
       0, 2, 0, 1, 2, 1, 0, 1, 1, 2, 1, 0, 2, 2, 1, 2, 0, 2, 2, 1, 2, 1,
       2, 0, 1, 2, 1, 1, 1, 0, 1, 2, 0, 2, 1, 2, 0, 2, 0, 1, 0, 2, 1, 0,
       2, 1, 1, 0, 0, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1, 2, 1, 2, 0, 2, 2, 0,
       2, 0, 2, 2, 0, 0, 1, 0, 0, 1, 0, 2, 2, 2])
In [ ]:
# Number of observations per cluster: the distinct labels and how often each occurs
import numpy
print(numpy.unique(kmc.labels_,return_counts=True))
(array([0, 1, 2]), array([100,  98, 102], dtype=int64))
In [ ]:
# New DataFrame: the original data plus the K-Means label in a "groupes" column
# (assign works on a copy, so D itself is left untouched)
X = D.assign(groupes=kmc.labels_)

# Scatter plot in the (X1, X2) plane, one colour per cluster
sns.scatterplot(
    data=X,
    x="X1",
    y="X2",
    hue="groupes",
    palette=["red", "green", "blue"],
)
Out[ ]:
<Axes: xlabel='X1', ylabel='X2'>
In [ ]:
# Cluster centroids (conditional means) found by K-Means, stored as a table
# whose columns carry the same names as the input variables
XCenters = pandas.DataFrame(data=kmc.cluster_centers_, columns=["X1", "X2"])
XCenters
Out[ ]:
X1 X2
0 4.923975 -4.934515
1 0.184934 2.052779
2 3.982930 1.125263
In [ ]:
# Points coloured by cluster ...
ax = sns.scatterplot(
    data=X, x="X1", y="X2", hue="groupes", palette=["red", "green", "blue"]
)
# ... with the centroids overlaid on the same axes as large black crosses
sns.scatterplot(
    data=XCenters, x="X1", y="X2", color="black", marker="X", s=100, ax=ax
)
Out[ ]:
<Axes: xlabel='X1', ylabel='X2'>

Fuzzy K-Means

In [ ]:
# scikit-fda library (needs to be installed only once: uncomment the line below)
#!pip install scikit-fda
In [ ]:
# Check which version of scikit-fda is installed
import skfda
skfda.__version__
Out[ ]:
'0.8.1'
In [ ]:
# Run the fuzzy k-means (Fuzzy C-Means) from scikit-fda
from skfda.ml.clustering import FuzzyCMeans  # estimator class
from skfda.representation.grid import FDataGrid  # data container handed to the estimator

# Same cluster count, seed and number of initialisations as the K-Means run,
# plus one additional parameter: the "fuzzifier"
fcm = FuzzyCMeans(
    fuzzifier=2,
    n_clusters=3,
    n_init=1,
    random_state=0,
)
fcm.fit(FDataGrid(D))

# Labels: the cluster each observation is assigned to
fcm.labels_
Out[ ]:
array([2, 1, 2, 1, 0, 0, 2, 0, 1, 1, 1, 1, 2, 2, 1, 0, 2, 2, 0, 2, 2, 2,
       0, 0, 0, 2, 2, 0, 2, 2, 0, 0, 1, 2, 1, 2, 1, 0, 1, 0, 0, 1, 0, 0,
       2, 2, 0, 1, 2, 2, 1, 2, 2, 2, 0, 2, 0, 1, 0, 2, 2, 1, 1, 0, 0, 1,
       1, 0, 1, 2, 0, 0, 0, 2, 0, 2, 1, 0, 0, 2, 2, 2, 0, 2, 0, 0, 2, 2,
       0, 2, 1, 0, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 0, 1, 1, 0, 0, 2, 1, 2,
       1, 2, 1, 0, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 1, 0, 0, 2, 1, 2, 1, 2,
       0, 1, 1, 2, 1, 2, 2, 2, 1, 0, 0, 1, 2, 1, 0, 2, 1, 0, 1, 2, 1, 2,
       0, 1, 1, 2, 1, 2, 2, 0, 0, 1, 0, 2, 1, 0, 0, 2, 0, 2, 2, 0, 0, 0,
       1, 2, 1, 1, 0, 2, 0, 2, 1, 0, 0, 1, 2, 1, 2, 0, 0, 0, 0, 2, 0, 2,
       0, 0, 1, 1, 2, 1, 1, 0, 1, 1, 1, 2, 0, 2, 1, 0, 1, 0, 1, 2, 0, 0,
       2, 1, 2, 0, 1, 0, 2, 0, 0, 1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 0, 1, 0,
       1, 2, 0, 1, 0, 0, 0, 2, 0, 1, 2, 1, 0, 1, 2, 1, 2, 0, 2, 1, 0, 2,
       1, 0, 0, 2, 2, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 2, 1, 1, 2,
       1, 2, 1, 1, 2, 2, 0, 2, 2, 0, 2, 1, 1, 1], dtype=int64)
In [ ]:
# Cluster centres of the fuzzy model (displayed as an FDataGrid)
fcm.cluster_centers_
Out[ ]:
FDataGrid(
    array([[[ 0.18731988],
            [ 2.01415007]],
    
           [[ 4.0607    ],
            [ 1.19629239]],
    
           [[ 4.95843814],
            [-4.97711646]]]),
    grid_points=(array([0., 1.]),),
    domain_range=((0.0, 1.0),),
    dataset_name=None,
    argument_names=(None,),
    coordinate_names=(None,),
    extrapolation=None,
    interpolation=SplineInterpolation(interpolation_order=1, monotone=False))
In [ ]:
# Extra information provided by the fuzzy model: the membership degree of
# every observation to each cluster, shown with one column per cluster
pandas.DataFrame(fcm.membership_degree_,columns=['C0','C1','C2'])
Out[ ]:
C0 C1 C2
0 0.006305 0.011788 0.981907
1 0.031057 0.955298 0.013645
2 0.026736 0.054526 0.918739
3 0.312985 0.653088 0.033926
4 0.988601 0.009396 0.002003
... ... ... ...
295 0.837043 0.121239 0.041718
296 0.016815 0.027103 0.956083
297 0.085477 0.862568 0.051955
298 0.033699 0.956475 0.009826
299 0.087187 0.797421 0.115392

300 rows × 3 columns

In [ ]:
# Map membership degrees onto [0, 1] so they can be used as plot transparency
def rescale(values, lower=0.33):
    """Linearly rescale membership degrees from [lower, 1] onto [0, 1].

    Parameters
    ----------
    values : scalar or array-like of float
        Membership degrees to rescale.
    lower : float, default 0.33
        Degree mapped to 0. The default matches the three-cluster runs in
        this notebook, where membership degrees sum to 1 per observation, so
        the largest one is at least about 1/3. Use roughly ``1 / n_clusters``
        for another number of clusters.

    Returns
    -------
    numpy scalar or ndarray
        Rescaled values, clipped to [0, 1] so they are always a valid
        ``alpha`` (values below ``lower`` would otherwise turn negative).

    Raises
    ------
    ValueError
        If ``lower`` is not in [0, 1), which would make the mapping undefined.
    """
    if not 0.0 <= lower < 1.0:
        raise ValueError("lower must satisfy 0 <= lower < 1")
    scaled = (values - lower) / (1.0 - lower)
    # Guard against out-of-range results (degree below `lower`, rounding).
    return numpy.clip(scaled, 0.0, 1.0)

# Plot with the membership degrees: each cluster is drawn in its own colour and
# a point's transparency is its rescaled degree of membership to that cluster
for cluster, colour in enumerate(["green", "blue", "red"]):
    members = fcm.labels_ == cluster
    ax = sns.scatterplot(
        data=D.loc[members],
        x="X1",
        y="X2",
        color=colour,
        alpha=rescale(fcm.membership_degree_[members, cluster]),
    )
# Echo the Axes of the last call as the cell's result
ax
Out[ ]:
<Axes: xlabel='X1', ylabel='X2'>

Faire varier le paramètre fuzzifier

In [ ]:
# Fuzzifier values to try: 1.02, close to crisp; 3, more smoothing;
# 5, (almost) everything is fuzzy
fcm_bis = FuzzyCMeans(
    fuzzifier=3,
    n_clusters=3,
    n_init=1,
    random_state=0,
)
fcm_bis.fit(FDataGrid(D))

# Plot with the membership degrees: one colour per cluster, transparency given
# by the rescaled degree of membership to the assigned cluster
for cluster, colour in enumerate(["green", "blue", "red"]):
    members = fcm_bis.labels_ == cluster
    ax = sns.scatterplot(
        data=D.loc[members],
        x="X1",
        y="X2",
        color=colour,
        alpha=rescale(fcm_bis.membership_degree_[members, cluster]),
    )
# Echo the Axes of the last call as the cell's result
ax
Out[ ]:
<Axes: xlabel='X1', ylabel='X2'>