Importation et préparation des données

In [ ]:
# Working directory. The location can be overridden with the VOTE_DATA_DIR
# environment variable so the notebook is not tied to a single machine;
# when the variable is unset the original folder is used.
import os
os.chdir(os.environ.get("VOTE_DATA_DIR", "C:/Users/ricco/Desktop/demo"))

# Load the data; the file name is resolved relative to the working directory
# selected above.
import pandas
vote = pandas.read_excel("vote_clustering.xlsx")
vote.head()
Out[ ]:
affiliation budget physician salvador nicaraguan missile education
0 republican n y y n n y
1 republican n y y n n y
2 democrat y neither y n n n
3 democrat y n neither n n n
4 democrat y n y n n neither
In [ ]:
# Structure of the dataset: column names, non-null counts and dtypes
vote.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435 entries, 0 to 434
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   affiliation  435 non-null    object
 1   budget       435 non-null    object
 2   physician    435 non-null    object
 3   salvador     435 non-null    object
 4   nicaraguan   435 non-null    object
 5   missile      435 non-null    object
 6   education    435 non-null    object
dtypes: object(7)
memory usage: 23.9+ KB
In [ ]:
# Active variables: every column except the first one (positional slice 1:)
X = vote.iloc[:,1:]
# Show which columns were kept
print(X.columns)
Index(['budget', 'physician', 'salvador', 'nicaraguan', 'missile',
       'education'],
      dtype='object')
In [ ]:
# Complete disjunctive coding: one indicator column per (variable, category).
# The dtype is pinned explicitly because the default of get_dummies depends on
# the pandas version (uint8 historically, bool from pandas 2.0); boolean
# columns cannot be subtracted from one another, which the distance
# computation on these columns relies on.
M = pandas.get_dummies(X,drop_first=False,dtype="uint8")
M.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435 entries, 0 to 434
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   budget_n            435 non-null    uint8
 1   budget_neither      435 non-null    uint8
 2   budget_y            435 non-null    uint8
 3   physician_n         435 non-null    uint8
 4   physician_neither   435 non-null    uint8
 5   physician_y         435 non-null    uint8
 6   salvador_n          435 non-null    uint8
 7   salvador_neither    435 non-null    uint8
 8   salvador_y          435 non-null    uint8
 9   nicaraguan_n        435 non-null    uint8
 10  nicaraguan_neither  435 non-null    uint8
 11  nicaraguan_y        435 non-null    uint8
 12  missile_n           435 non-null    uint8
 13  missile_neither     435 non-null    uint8
 14  missile_y           435 non-null    uint8
 15  education_n         435 non-null    uint8
 16  education_neither   435 non-null    uint8
 17  education_y         435 non-null    uint8
dtypes: uint8(18)
memory usage: 7.8 KB

CAH sur les modalités

In [ ]:
import numpy
# Squared Dice distance
def funSqDice(col1,col2):
    """Return the squared Dice distance between two indicator vectors.

    The value is half the sum of the squared element-wise differences:
    ``0.5 * sum((col1 - col2) ** 2)``.

    Parameters
    ----------
    col1, col2 : array-like
        Vectors of equal length (numpy arrays, pandas Series, lists).
        Values are compared position by position.

    Returns
    -------
    numpy.float64
        The squared distance; 0.0 when both vectors are identical.
    """
    # Cast to float before subtracting: numpy refuses `-` on boolean arrays,
    # and unsigned integer arrays wrap around on negative differences.
    a = numpy.asarray(col1, dtype=float)
    b = numpy.asarray(col2, dtype=float)
    return 0.5 * numpy.sum((a - b) ** 2)
In [ ]:
# Work on the raw numpy values of M for convenience
MN = M.values

# Square matrix holding the squared Dice distance between every pair of
# indicator columns (one row and one column per column of M)
n_cols = M.shape[1]
D = numpy.zeros(shape=(n_cols, n_cols))
for i, j in numpy.ndindex(n_cols, n_cols):
    D[i, j] = funSqDice(MN[:, i], MN[:, j])

# Take the square root of the squared distances
D = numpy.sqrt(D)

# Show the top-left 5x5 corner
print(D[:5, :5])
[[ 0.          9.53939201 14.56021978 13.56465997  9.53939201]
 [ 9.53939201  0.         11.48912529 11.22497216  2.23606798]
 [14.56021978 11.48912529  0.          5.56776436 11.26942767]
 [13.56465997 11.22497216  5.56776436  0.         11.35781669]
 [ 9.53939201  2.23606798 11.26942767 11.35781669  0.        ]]
In [ ]:
# Turn the square distance matrix D into the vector form that is passed to
# the hierarchical clustering step
from scipy.spatial.distance import squareform
VD = squareform(D)
print(VD)
[ 9.53939201 14.56021978 13.56465997  9.53939201  5.29150262 13.17193987
  9.53939201  6.20483682  5.87367006  9.21954446 13.54621718  6.51920241
  9.61769203 12.9614814  13.19090596  9.53939201  6.164414   11.48912529
 11.22497216  2.23606798  9.59166305 10.27131929  3.         10.41633333
  9.5131488   3.16227766 11.06797181 10.27131929  3.80788655 10.14889157
 10.86278049  4.12310563  9.38083152  5.56776436 11.26942767 13.6381817
  6.51920241 11.18033989 13.28533026 13.47219359 11.40175425  5.70087713
 13.13392554 11.02270384  7.07106781  6.4807407  11.18033989 13.3041347
 11.35781669 14.56021978  5.70087713 11.         13.69306394 13.39776101
 11.3137085   5.78791845 13.36038922 10.74709263  6.8556546   6.164414
 11.09053651 13.41640786  9.69535971 10.22252415  3.         10.46422477
  9.61769203  3.16227766 10.9772492  10.22252415  3.80788655 10.19803903
 10.77032961  4.12310563  9.48683298 13.58307771  9.74679434  5.14781507
  5.87367006  9.32737905 13.58307771  6.12372436  9.92471662 13.03840481
 13.41640786  9.64365076  5.74456265 10.55935604 14.49137675 13.82027496
 10.46422477  4.58257569 13.82027496 10.09950494  5.33853913  6.59545298
 10.36822068 13.05756486 10.65363788  9.61769203  3.31662479 11.02270384
 10.22252415  4.0620192  10.19803903 10.81665383  4.24264069  9.48683298
  4.79583152 10.22252415 14.          5.         10.48808848 13.72953022
 13.17193987 10.36822068  6.51920241  9.82344135 14.49137675  5.47722558
  9.79795897 13.43502884 13.01921657  9.72111105  6.51920241 11.33578405
 10.12422837  3.80788655 10.39230485 11.          4.12310563  9.32737905
 13.7113092  10.86278049  5.70087713  6.59545298 11.02270384 13.17193987
 10.67707825 14.37010786 12.98075499 10.22252415  6.89202438 10.70046728
 10.79351657  4.63680925  9.5131488   7.         10.34408043 12.84523258
 11.48912529 14.2126704  10.04987562]
In [ ]:
# Hierarchical agglomerative clustering (CAH) with Ward's method, run on the
# vectorized distances VD; the printed array is the resulting merge table
from scipy.cluster.hierarchy import ward
cah = ward(VD)
print(cah)
[[ 1.          4.          2.23606798  2.        ]
 [ 7.         18.          3.21455025  3.        ]
 [10.         19.          3.41565026  4.        ]
 [13.         20.          4.31277173  5.        ]
 [ 6.         11.          4.58257569  2.        ]
 [16.         21.          4.75043858  6.        ]
 [ 8.          9.          4.79583152  2.        ]
 [ 0.          5.          5.29150262  2.        ]
 [12.         24.          5.38516481  3.        ]
 [ 2.          3.          5.56776436  2.        ]
 [14.         22.          5.8022984   3.        ]
 [17.         25.          6.164414    3.        ]
 [15.         27.          6.55743852  3.        ]
 [26.         29.          7.51664819  6.        ]
 [28.         30.          7.76745347  6.        ]
 [23.         31.         21.49030789 12.        ]
 [32.         33.         29.09467305 18.        ]]
In [ ]:
# Dendrogram of the hierarchy, leaves labelled with the indicator column names
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
plt.title("CAH")
# Labels are handed over as a plain list rather than a pandas Index: some
# SciPy releases reportedly truth-test `labels`, which is ambiguous for an
# Index -- TODO confirm against the installed SciPy version.
dendrogram(cah,labels=list(M.columns),orientation='left',color_threshold=0)
plt.show()
In [ ]:
# Same dendrogram with a color threshold of 15.0 to make 3 classes visible
plt.title("CAH - 3 classes")
# Labels are handed over as a plain list rather than a pandas Index: some
# SciPy releases reportedly truth-test `labels`, which is ambiguous for an
# Index -- TODO confirm against the installed SciPy version.
dendrogram(cah,labels=list(M.columns),orientation='left',color_threshold=15.0)
plt.show()
In [ ]:
# Cut the hierarchy at distance 15.0 to assign a cluster label to each
# indicator column; the printed labels take 3 distinct values
from scipy.cluster.hierarchy import fcluster
groupes = fcluster(cah,t=15.0,criterion='distance')
print(groupes)
[3 2 1 1 2 3 1 2 3 3 2 1 3 2 1 1 2 3]
In [ ]:
# Cluster sizes: the distinct labels and how many columns carry each one
print(numpy.unique(groupes,return_counts=True))
(array([1, 2, 3], dtype=int32), array([6, 6, 6], dtype=int64))
In [ ]:
# List, for each cluster label, the indicator columns assigned to it
for g in numpy.unique(groupes):
    members = M.columns[groupes == g]
    print(g, " : ", members)
1  :  Index(['budget_y', 'physician_n', 'salvador_n', 'nicaraguan_y', 'missile_y',
       'education_n'],
      dtype='object')
2  :  Index(['budget_neither', 'physician_neither', 'salvador_neither',
       'nicaraguan_neither', 'missile_neither', 'education_neither'],
      dtype='object')
3  :  Index(['budget_n', 'physician_y', 'salvador_y', 'nicaraguan_n', 'missile_n',
       'education_y'],
      dtype='object')

Variable illustrative - Affiliation

In [ ]:
# 0/1 coding of the affiliation, one indicator column per party.
# The dtype is pinned explicitly because the default of get_dummies depends on
# the pandas version (uint8 historically, bool from pandas 2.0); a boolean
# indicator cannot be subtracted from another boolean column, which the
# distance computation on M_Aff.democrat relies on.
M_Aff = pandas.get_dummies(vote.affiliation,drop_first=False,dtype="uint8")
M_Aff.head()
Out[ ]:
democrat republican
0 0 1
1 0 1
2 1 0
3 1 0
4 1 0
In [ ]:
# Link between the democrat indicator and each cluster: average, over the
# cluster's indicator columns, of the squared Dice distance to M_Aff.democrat
# (a smaller value means the cluster is closer to the democrat indicator)
for g in numpy.unique(groupes):
    group_columns = M.loc[:, groupes == g]
    distances = group_columns.apply(lambda x: funSqDice(x, M_Aff.democrat), axis=0)
    print(g, " : ", numpy.mean(distances))
1  :  33.5
2  :  130.91666666666666
3  :  186.58333333333334