Importation et inspection des données¶

In [ ]:
# Switch to the working directory that holds the Excel workbook.
# NOTE(review): absolute, machine-specific path - adapt it before running elsewhere.
import os
os.chdir("C:/Users/ricco/Desktop/demo")

# Load the adjacency matrix into a DataFrame:
# row 0 of the sheet supplies the column labels, column 0 the row labels.
import pandas
dfAdj = pandas.read_excel(
    "matrice_adjacence_iut.xlsx",
    index_col=0,
    header=0,
)
# Preview the first rows.
dfAdj.head()
Out[ ]:
BEN BES BOU BRU CAM CHU DUC LAN LEX MAR ... CAL DIF FIR FRE FUM HAD HEL MZA VER VID
IUT
BEN 0 0 1 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 1 0 0
BES 0 0 0 1 1 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
BOU 1 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 1 0 0 1 0 0
BRU 0 1 0 0 1 1 0 1 1 0 ... 0 1 1 1 0 0 1 0 1 0
CAM 0 1 0 1 0 1 0 0 0 0 ... 0 1 1 1 0 0 0 0 1 0

5 rows × 25 columns

In [ ]:
# Extract the underlying NumPy array from the DataFrame (row/column labels are left behind).
MAdj = dfAdj.values
# Preview the top-left 5x5 corner of the matrix.
MAdj[:5,:5]
Out[ ]:
array([[0, 0, 1, 0, 0],
       [0, 0, 0, 1, 1],
       [1, 0, 0, 0, 0],
       [0, 1, 0, 0, 1],
       [0, 1, 0, 1, 0]], dtype=int64)
In [ ]:
# Dimensions of the adjacency matrix, as (rows, columns).
MAdj.shape
Out[ ]:
(25, 25)
In [ ]:
# Number of connections: total of all matrix entries, halved.
# NOTE(review): halving assumes every link is recorded twice (symmetric matrix,
# empty diagonal) - confirm against the data.
import numpy
print(MAdj.sum()/2)
85.0

Visualisation sous forme de graphe¶

In [ ]:
# Import igraph and display the installed version of the package.
import igraph
igraph.__version__
Out[ ]:
'0.10.4'
In [ ]:
# Build the graph structure from the adjacency matrix, as an undirected graph.
g = igraph.Graph.Adjacency(MAdj,mode=igraph.ADJ_UNDIRECTED)
# Textual summary of the graph (vertex/edge counts and adjacency list).
print(g)
IGRAPH U--- 25 85 --
+ edges:
 0 --  2  8 12 22                           14 --  2  3  9 12 15 18 19 20 22
24
 1 --  3  4  5  6 11                        15 -- 12 14 19 20 24
 2 --  0  9 10 14 19 22                     16 --  3  4 10 13 17 18 21 23
 3 --  1  4  5  7  8 11 14 16 17 18 21 23   17 --  3  4  6  7 12 13 16 18 21
23
 4 --  1  3  5 11 16 17 18 23               18 --  3  4  7 13 14 16 17 21 23
 5 --  1  3  4  6 11 20                     19 --  2  9 12 14 15 20 22 24
 6 --  1  5 11 17                           20 --  5 11 12 14 15 19 22 24
 7 --  3  8 17 18 23                        21 --  3 11 13 16 17 18 23
 8 --  0  3  7 22                           22 --  0  2  8  9 10 14 19 20 24
 9 --  2 10 14 19 22                        23 --  3  4  7 13 16 17 18 21
10 --  2  9 16 22                           24 -- 12 14 15 19 20 22
11 --  1  3  4  5  6 20 21
12 --  0 14 15 17 19 20 24
13 -- 16 17 18 21 23
In [ ]:
# Give each vertex a display label taken from the column names of the DataFrame.
g.vs['label'] = list(dfAdj.columns)
# Control display of the labels now attached to the vertices.
print(g.vs['label'])
['BEN', 'BES', 'BOU', 'BRU', 'CAM', 'CHU', 'DUC', 'LAN', 'LEX', 'MAR', 'ROG', 'ROS', 'TOS', 'BAR', 'BEL', 'CAL', 'DIF', 'FIR', 'FRE', 'FUM', 'HAD', 'HEL', 'MZA', 'VER', 'VID']
In [ ]:
# Draw the social graph.
# Caution: rendering requires the pycairo package to be installed.
igraph.plot(
    g,
    bbox=(0,0,400,400),
    vertex_size=30,
    vertex_label_size=10,
    vertex_color='antiquewhite',
)
Out[ ]:

Clustering - Découverte de communautés¶

Préparation de la matrice de distance¶

In [ ]:
# Turn the adjacency matrix into a cosine-distance matrix:
# each row of MAdj is compared with every other row.
from sklearn.metrics.pairwise import cosine_distances
D = cosine_distances(MAdj)

# First values (top-left 5x5 corner).
print(D[:5,:5])
[[0.         1.         0.79587585 0.85566243 1.        ]
 [1.         0.         1.         0.61270167 0.52565835]
 [0.79587585 1.         0.         0.88214887 1.        ]
 [0.85566243 0.61270167 0.88214887 0.         0.28556549]
 [1.         0.52565835 1.         0.28556549 0.        ]]
In [ ]:
# Check that D really is a valid distance matrix
# (is_valid_dm verifies a 2-D, symmetric array with a zero diagonal).
# The submodule is imported explicitly: a bare `import scipy` does not
# guarantee that `scipy.spatial.distance` is loaded on every SciPy version.
# This form still binds the top-level name `scipy`, as the original import did.
import scipy.spatial.distance
print(scipy.spatial.distance.is_valid_dm(D))
True

Clustering avec la CAH¶

In [ ]:
# Convert the square distance matrix into the condensed (vector) form
# expected by the hierarchical clustering routines.
from scipy.spatial.distance import squareform
VD = squareform(D)
# First values of the condensed vector.
print(VD[:5])
[1.         0.79587585 0.85566243 1.         1.        ]
In [ ]:
# Hierarchical agglomerative clustering (CAH) with Ward's criterion,
# computed from the condensed distance vector.
# NOTE(review): SciPy documents Ward linkage as correctly defined only for
# Euclidean distances, whereas VD is derived from cosine distances - merge
# heights should be read as heuristic; confirm this choice is intended.
from scipy.cluster.hierarchy import ward
cah = ward(VD)

# Dendrogram of the full hierarchy (color_threshold=0: no cluster colouring).
# The labels are passed as a plain list rather than a pandas Index, which
# avoids the ambiguous-truth-value ValueError seen with array-like labels on
# older SciPy releases.
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
plt.title("CAH")
dendrogram(cah,labels=list(dfAdj.columns),orientation='left',color_threshold=0)
plt.show()
In [ ]:
# Same dendrogram, coloured below height 2.0 so that the 2 classes stand out.
# The labels are passed as a plain list rather than a pandas Index, which
# avoids the ambiguous-truth-value ValueError seen with array-like labels on
# older SciPy releases.
plt.title("CAH - 2 classes")
dendrogram(cah,labels=list(dfAdj.columns),orientation='left',color_threshold=2.0)
plt.show()
In [ ]:
# Cut the hierarchy into 2 classes: the tree is cut at height t=2.0
# (criterion='distance'), giving one cluster id per individual.
from scipy.cluster.hierarchy import fcluster
groupes = fcluster(cah,t=2.0,criterion='distance')
# Cluster id of each individual, in the original row order.
print(groupes)
[1 2 1 2 2 2 2 2 1 1 1 2 1 2 1 1 2 2 2 1 1 2 1 2 1]
In [ ]:
# Cluster sizes: prints the distinct cluster ids together with their counts.
print(numpy.unique(groupes,return_counts=True))
(array([1, 2], dtype=int32), array([12, 13], dtype=int64))
In [ ]:
# One colour per cluster: cluster id k is mapped to couleurs[k-1].
couleurs = ['#df9012','#60bca0']

# Recolour the graph in a single pass: every vertex takes the colour of
# the cluster it belongs to.
g.vs['color'] = [couleurs[k-1] for k in groupes]

# Display the graph with the cluster colours.
igraph.plot(
    g,
    bbox=(0,0,400,400),
    vertex_size=30,
    vertex_label_size=10,
)
Out[ ]:

Détection des médoïdes des groupes¶

Matrice des distances avec les labels.

In [ ]:
# Wrap the distance matrix in a DataFrame, reusing the row and column
# identifiers of the adjacency DataFrame.
df_D = pandas.DataFrame(
    data=D,
    index=dfAdj.index,
    columns=dfAdj.columns,
)
df_D
Out[ ]:
BEN BES BOU BRU CAM CHU DUC LAN LEX MAR ... CAL DIF FIR FRE FUM HAD HEL MZA VER VID
IUT
BEN 0.000000 1.000000 0.795876 0.855662 1.000000 1.000000 1.000000 0.776393 0.750000 0.552786 ... 0.776393 1.000000 0.841886 1.000000 0.469670 0.646447 1.000000 0.666667 1.000000 0.591752
BES 1.000000 0.000000 1.000000 0.612702 0.525658 0.269703 0.552786 0.800000 0.776393 1.000000 ... 1.000000 0.683772 0.575736 0.701858 1.000000 0.683772 0.661938 1.000000 0.683772 1.000000
BOU 0.795876 1.000000 0.000000 0.882149 1.000000 1.000000 1.000000 1.000000 0.591752 0.269703 ... 0.634852 0.855662 1.000000 0.863917 0.566987 0.566987 1.000000 0.319586 1.000000 0.500000
BRU 0.855662 0.612702 0.882149 0.000000 0.285565 0.646447 0.422650 0.483602 0.855662 0.870901 ... 0.870901 0.489690 0.452277 0.326425 0.897938 0.693814 0.454455 0.807550 0.387628 0.882149
CAM 1.000000 0.525658 1.000000 0.285565 0.000000 0.566987 0.292893 0.367544 0.823223 1.000000 ... 1.000000 0.500000 0.552786 0.528595 1.000000 0.750000 0.198216 1.000000 0.500000 1.000000
CHU 1.000000 0.269703 1.000000 0.646447 0.566987 0.000000 0.591752 0.817426 0.795876 1.000000 ... 0.817426 0.711325 0.612702 0.727834 0.855662 0.855662 0.691393 0.863917 0.711325 0.833333
DUC 1.000000 0.552786 1.000000 0.422650 0.292893 0.591752 0.000000 0.776393 1.000000 1.000000 ... 1.000000 0.823223 1.000000 0.833333 1.000000 0.646447 0.622036 1.000000 0.823223 1.000000
LAN 0.776393 0.800000 1.000000 0.483602 0.367544 0.817426 0.776393 0.000000 0.776393 1.000000 ... 1.000000 0.367544 0.575736 0.552786 1.000000 1.000000 0.323877 0.850929 0.525658 1.000000
LEX 0.750000 0.776393 0.591752 0.855662 0.823223 0.795876 1.000000 0.776393 0.000000 0.776393 ... 1.000000 0.823223 0.683772 0.666667 0.823223 0.823223 0.811018 0.833333 0.646447 0.795876
MAR 0.552786 1.000000 0.269703 0.870901 1.000000 1.000000 1.000000 1.000000 0.776393 0.000000 ... 0.600000 0.841886 1.000000 0.850929 0.525658 0.525658 1.000000 0.403715 1.000000 0.452277
ROG 0.500000 1.000000 0.591752 0.855662 0.823223 1.000000 1.000000 1.000000 0.750000 0.552786 ... 1.000000 1.000000 0.841886 0.833333 0.469670 0.823223 0.811018 0.666667 0.823223 0.795876
ROS 1.000000 0.323877 1.000000 0.563564 0.599108 0.228483 0.622036 0.830969 0.811018 1.000000 ... 0.830969 0.599108 0.521909 0.622036 0.866369 0.866369 0.857143 0.874012 0.599108 0.845697
TOS 1.000000 1.000000 0.537090 0.781782 0.866369 0.845697 0.811018 0.830969 0.811018 0.661938 ... 0.323877 0.866369 1.000000 0.748024 0.465478 0.465478 0.857143 0.370059 0.866369 0.382787
BAR 1.000000 1.000000 1.000000 0.354503 0.367544 1.000000 0.776393 0.400000 1.000000 1.000000 ... 1.000000 0.367544 0.434315 0.403715 1.000000 1.000000 0.323877 1.000000 0.367544 1.000000
BEL 0.525658 0.858579 0.612702 0.908713 0.776393 0.741801 1.000000 0.717157 0.683772 0.575736 ... 0.434315 0.776393 0.700000 0.894591 0.217376 0.440983 0.760954 0.472954 0.776393 0.354503
CAL 0.776393 1.000000 0.634852 0.870901 1.000000 0.817426 1.000000 1.000000 1.000000 0.600000 ... 0.000000 1.000000 0.858579 0.850929 0.367544 0.367544 1.000000 0.403715 1.000000 0.269703
DIF 1.000000 0.683772 0.855662 0.489690 0.500000 0.711325 0.823223 0.367544 0.823223 0.841886 ... 1.000000 0.000000 0.329180 0.292893 1.000000 1.000000 0.331847 0.882149 0.250000 1.000000
FIR 0.841886 0.575736 1.000000 0.452277 0.552786 0.612702 1.000000 0.575736 0.683772 1.000000 ... 0.858579 0.329180 0.000000 0.262135 0.888197 0.888197 0.402386 1.000000 0.217376 0.870901
FRE 1.000000 0.701858 0.863917 0.326425 0.528595 0.727834 0.833333 0.552786 0.666667 0.850929 ... 0.850929 0.292893 0.262135 0.000000 0.882149 0.882149 0.370059 0.888889 0.175042 0.863917
FUM 0.469670 1.000000 0.566987 0.897938 1.000000 0.855662 1.000000 1.000000 0.823223 0.525658 ... 0.367544 1.000000 0.888197 0.882149 0.000000 0.375000 1.000000 0.410744 1.000000 0.278312
HAD 0.646447 0.683772 0.566987 0.693814 0.750000 0.855662 0.646447 1.000000 0.823223 0.525658 ... 0.367544 1.000000 0.888197 0.882149 0.375000 0.000000 0.866369 0.646447 1.000000 0.278312
HEL 1.000000 0.661938 1.000000 0.454455 0.198216 0.691393 0.622036 0.323877 0.811018 1.000000 ... 1.000000 0.331847 0.402386 0.370059 1.000000 0.866369 0.000000 1.000000 0.331847 1.000000
MZA 0.666667 1.000000 0.319586 0.807550 1.000000 0.863917 1.000000 0.850929 0.833333 0.403715 ... 0.403715 0.882149 1.000000 0.888889 0.410744 0.646447 1.000000 0.000000 1.000000 0.591752
VER 1.000000 0.683772 1.000000 0.387628 0.500000 0.711325 0.823223 0.525658 0.646447 1.000000 ... 1.000000 0.250000 0.217376 0.175042 1.000000 1.000000 0.331847 1.000000 0.000000 1.000000
VID 0.591752 1.000000 0.500000 0.882149 1.000000 0.833333 1.000000 1.000000 0.795876 0.452277 ... 0.269703 1.000000 0.870901 0.863917 0.278312 0.278312 1.000000 0.591752 1.000000 0.000000

25 rows × 25 columns

Illustration du calcul pour le premier groupe¶

In [ ]:
# Positions of the individuals assigned to the 1st cluster
# (numpy.where returns a 1-tuple for a 1-D condition, unpacked here).
(id_1,) = numpy.where(groupes==1)
id_1
Out[ ]:
array([ 0,  2,  8,  9, 10, 12, 14, 15, 19, 20, 22, 24], dtype=int64)
In [ ]:
# Build the matching sub-DataFrame: distances restricted to the members
# of the 1st cluster, on both rows and columns.
df_1 = df_D.iloc[id_1,id_1]
df_1
Out[ ]:
BEN BOU LEX MAR ROG TOS BEL CAL FUM HAD MZA VID
IUT
BEN 0.000000 0.795876 0.750000 0.552786 0.500000 1.000000 0.525658 0.776393 0.469670 0.646447 0.666667 0.591752
BOU 0.795876 0.000000 0.591752 0.269703 0.591752 0.537090 0.612702 0.634852 0.566987 0.566987 0.319586 0.500000
LEX 0.750000 0.591752 0.000000 0.776393 0.750000 0.811018 0.683772 1.000000 0.823223 0.823223 0.833333 0.795876
MAR 0.552786 0.269703 0.776393 0.000000 0.552786 0.661938 0.575736 0.600000 0.525658 0.525658 0.403715 0.452277
ROG 0.500000 0.591752 0.750000 0.552786 0.000000 1.000000 0.525658 1.000000 0.469670 0.823223 0.666667 0.795876
TOS 1.000000 0.537090 0.811018 0.661938 1.000000 0.000000 0.521909 0.323877 0.465478 0.465478 0.370059 0.382787
BEL 0.525658 0.612702 0.683772 0.575736 0.525658 0.521909 0.000000 0.434315 0.217376 0.440983 0.472954 0.354503
CAL 0.776393 0.634852 1.000000 0.600000 1.000000 0.323877 0.434315 0.000000 0.367544 0.367544 0.403715 0.269703
FUM 0.469670 0.566987 0.823223 0.525658 0.469670 0.465478 0.217376 0.367544 0.000000 0.375000 0.410744 0.278312
HAD 0.646447 0.566987 0.823223 0.525658 0.823223 0.465478 0.440983 0.367544 0.375000 0.000000 0.646447 0.278312
MZA 0.666667 0.319586 0.833333 0.403715 0.666667 0.370059 0.472954 0.403715 0.410744 0.646447 0.000000 0.591752
VID 0.591752 0.500000 0.795876 0.452277 0.795876 0.382787 0.354503 0.269703 0.278312 0.278312 0.591752 0.000000
In [ ]:
# Row-wise sum of the within-cluster distances
# (per the original note, summing by column would have worked as well).
sum_1 = df_1.sum(axis=1)
sum_1
Out[ ]:
IUT
BEN    7.275249
BOU    5.987287
LEX    8.638591
MAR    5.896653
ROG    7.675632
TOS    6.539632
BEL    5.365565
CAL    6.177943
FUM    4.969663
HAD    5.959303
MZA    5.785639
VID    5.291150
dtype: float64
In [ ]:
# Position of the smallest row sum.
pos_min = sum_1.argmin()
print(pos_min)

# Identifier of the individual found at that position.
print(df_1.index[pos_min])
8
FUM

Réalisation des calculs pour l'ensemble des groupes¶

In [ ]:
# Medoid label of each cluster, appended in ascending cluster-id order.
medoides = []

for grp in numpy.unique(groupes):
    # Positions of the individuals belonging to this cluster.
    (id_grp,) = numpy.where(groupes==grp)
    # Within-cluster distances, summed per individual.
    sum_grp = df_D.iloc[id_grp,id_grp].sum(axis=1)
    # Control display.
    print(f'\nGroupe {grp}')
    print(sum_grp)
    # The medoid is the individual with the smallest total distance.
    medoides.append(sum_grp.index[sum_grp.argmin()])

# Show the medoids.
print('\nMedoïdes : ', medoides)
Groupe 1
IUT
BEN    7.275249
BOU    5.987287
LEX    8.638591
MAR    5.896653
ROG    7.675632
TOS    6.539632
BEL    5.365565
CAL    6.177943
FUM    4.969663
HAD    5.959303
MZA    5.785639
VID    5.291150
dtype: float64

Groupe 2
IUT
BES    7.391803
BRU    5.479507
CAM    5.284900
CHU    7.575377
DUC    8.136718
LAN    6.821537
ROS    7.198309
BAR    6.626405
DIF    5.746127
FIR    5.936537
FRE    5.796712
HEL    5.569073
VER    5.572524
dtype: float64

Medoïdes :  ['FUM', 'CAM']

Affichage du graphe avec mise en évidence des médoïdes¶

In [ ]:
# Highlight the two medoids: the i-th colour goes to the i-th medoid,
# located in the graph through its position in the DataFrame index.
for rang, teinte in enumerate(('#fd3700', '#00b400')):
    g.vs[dfAdj.index.get_loc(medoides[rang])]['color'] = teinte

# Redraw the graph.
igraph.plot(
    g,
    bbox=(0,0,400,400),
    vertex_size=30,
    vertex_label_size=10,
)
Out[ ]: