# Work from the folder that holds the demo workbook
import os
os.chdir("C:/Users/ricco/Desktop/demo")

# Read the voting records into a DataFrame and preview the first five rows
import pandas
vote = pandas.read_excel(io="vote_clustering.xlsx")
vote.head(n=5)
 | affiliation | budget | physician | salvador | nicaraguan | missile | education |
---|---|---|---|---|---|---|---|
0 | republican | n | y | y | n | n | y |
1 | republican | n | y | y | n | n | y |
2 | democrat | y | neither | y | n | n | n |
3 | democrat | y | n | neither | n | n | n |
4 | democrat | y | n | y | n | n | neither |
# Print a summary of the DataFrame: index range, columns, non-null counts and dtypes
vote.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 435 entries, 0 to 434 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 affiliation 435 non-null object 1 budget 435 non-null object 2 physician 435 non-null object 3 salvador 435 non-null object 4 nicaraguan 435 non-null object 5 missile 435 non-null object 6 education 435 non-null object dtypes: object(7) memory usage: 23.9+ KB
# Active variables: keep every column from position 1 onward
# (position 0 is skipped; the info() output lists it as `affiliation`)
X = vote.iloc[:,1:]
print(X.columns)
Index(['budget', 'physician', 'salvador', 'nicaraguan', 'missile', 'education'], dtype='object')
# Full disjunctive (one-hot) coding: one 0/1 indicator column per modality.
# The dtype is pinned to int because pandas >= 2.0 produces bool indicators by
# default, and the distance computation on these columns needs numeric values
# (int also avoids relying on unsigned wrap-around when columns are subtracted).
M = pandas.get_dummies(X, drop_first=False, dtype=int)
M.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 435 entries, 0 to 434 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 budget_n 435 non-null uint8 1 budget_neither 435 non-null uint8 2 budget_y 435 non-null uint8 3 physician_n 435 non-null uint8 4 physician_neither 435 non-null uint8 5 physician_y 435 non-null uint8 6 salvador_n 435 non-null uint8 7 salvador_neither 435 non-null uint8 8 salvador_y 435 non-null uint8 9 nicaraguan_n 435 non-null uint8 10 nicaraguan_neither 435 non-null uint8 11 nicaraguan_y 435 non-null uint8 12 missile_n 435 non-null uint8 13 missile_neither 435 non-null uint8 14 missile_y 435 non-null uint8 15 education_n 435 non-null uint8 16 education_neither 435 non-null uint8 17 education_y 435 non-null uint8 dtypes: uint8(18) memory usage: 7.8 KB
import numpy
# Squared Dice distance function
def funSqDice(col1, col2):
    """Return the squared Dice distance between two indicator vectors.

    Computed as ``0.5 * sum((col1 - col2) ** 2)``; for 0/1 (or boolean)
    vectors this is half the number of positions where the two differ.

    Parameters
    ----------
    col1, col2 : array-like of equal length
        Indicator vectors (lists, numpy arrays or pandas Series). They are
        compared position by position; any pandas index is ignored.

    Returns
    -------
    numpy.float64
        The squared distance.
    """
    # Cast to float first: subtracting bool arrays raises TypeError in numpy,
    # and subtracting unsigned integers wraps around instead of going negative.
    a = numpy.asarray(col1, dtype=float)
    b = numpy.asarray(col2, dtype=float)
    return 0.5 * numpy.sum((a - b) ** 2)
# For convenience, work on the numpy array underlying M
MN = M.values
# Distance matrix: apply the squared Dice function to every pair of columns
n_modalities = M.shape[1]
D = numpy.zeros(shape=(n_modalities, n_modalities))
# Iterating over the transpose yields the columns of MN one at a time
for k1, col1 in enumerate(MN.T):
    for k2, col2 in enumerate(MN.T):
        D[k1, k2] = funSqDice(col1, col2)
# Square root of the squared distances
D = numpy.sqrt(D)
# Display the top-left 5x5 corner
print(D[:5, :5])
[[ 0. 9.53939201 14.56021978 13.56465997 9.53939201] [ 9.53939201 0. 11.48912529 11.22497216 2.23606798] [14.56021978 11.48912529 0. 5.56776436 11.26942767] [13.56465997 11.22497216 5.56776436 0. 11.35781669] [ 9.53939201 2.23606798 11.26942767 11.35781669 0. ]]
# Convert the square distance matrix to its vector form for the hierarchical clustering
from scipy.spatial.distance import squareform
VD = squareform(D)
print(VD)
[ 9.53939201 14.56021978 13.56465997 9.53939201 5.29150262 13.17193987 9.53939201 6.20483682 5.87367006 9.21954446 13.54621718 6.51920241 9.61769203 12.9614814 13.19090596 9.53939201 6.164414 11.48912529 11.22497216 2.23606798 9.59166305 10.27131929 3. 10.41633333 9.5131488 3.16227766 11.06797181 10.27131929 3.80788655 10.14889157 10.86278049 4.12310563 9.38083152 5.56776436 11.26942767 13.6381817 6.51920241 11.18033989 13.28533026 13.47219359 11.40175425 5.70087713 13.13392554 11.02270384 7.07106781 6.4807407 11.18033989 13.3041347 11.35781669 14.56021978 5.70087713 11. 13.69306394 13.39776101 11.3137085 5.78791845 13.36038922 10.74709263 6.8556546 6.164414 11.09053651 13.41640786 9.69535971 10.22252415 3. 10.46422477 9.61769203 3.16227766 10.9772492 10.22252415 3.80788655 10.19803903 10.77032961 4.12310563 9.48683298 13.58307771 9.74679434 5.14781507 5.87367006 9.32737905 13.58307771 6.12372436 9.92471662 13.03840481 13.41640786 9.64365076 5.74456265 10.55935604 14.49137675 13.82027496 10.46422477 4.58257569 13.82027496 10.09950494 5.33853913 6.59545298 10.36822068 13.05756486 10.65363788 9.61769203 3.31662479 11.02270384 10.22252415 4.0620192 10.19803903 10.81665383 4.24264069 9.48683298 4.79583152 10.22252415 14. 5. 10.48808848 13.72953022 13.17193987 10.36822068 6.51920241 9.82344135 14.49137675 5.47722558 9.79795897 13.43502884 13.01921657 9.72111105 6.51920241 11.33578405 10.12422837 3.80788655 10.39230485 11. 4.12310563 9.32737905 13.7113092 10.86278049 5.70087713 6.59545298 11.02270384 13.17193987 10.67707825 14.37010786 12.98075499 10.22252415 6.89202438 10.70046728 10.79351657 4.63680925 9.5131488 7. 10.34408043 12.84523258 11.48912529 14.2126704 10.04987562]
# Hierarchical agglomerative clustering (CAH) with Ward's method on the vectorised distances
from scipy.cluster.hierarchy import ward
cah = ward(VD)
print(cah)
[[ 1. 4. 2.23606798 2. ] [ 7. 18. 3.21455025 3. ] [10. 19. 3.41565026 4. ] [13. 20. 4.31277173 5. ] [ 6. 11. 4.58257569 2. ] [16. 21. 4.75043858 6. ] [ 8. 9. 4.79583152 2. ] [ 0. 5. 5.29150262 2. ] [12. 24. 5.38516481 3. ] [ 2. 3. 5.56776436 2. ] [14. 22. 5.8022984 3. ] [17. 25. 6.164414 3. ] [15. 27. 6.55743852 3. ] [26. 29. 7.51664819 6. ] [28. 30. 7.76745347 6. ] [23. 31. 21.49030789 12. ] [32. 33. 29.09467305 18. ]]
# Dendrograms: first with color_threshold=0, then with color_threshold=15.0
# (the height used to materialise 3 classes)
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram

for plot_title, colour_cut in (("CAH", 0), ("CAH - 3 classes", 15.0)):
    plt.title(plot_title)
    dendrogram(cah, labels=M.columns, orientation='left', color_threshold=colour_cut)
    plt.show()
# Cut the tree into 3 classes by slicing it at a fixed height
from scipy.cluster.hierarchy import fcluster

cut_height = 15.0
groupes = fcluster(cah, t=cut_height, criterion='distance')
print(groupes)
[3 2 1 1 2 3 1 2 3 3 2 1 3 2 1 1 2 3]
# Group sizes: the distinct group labels together with how often each occurs
print(numpy.unique(groupes,return_counts=True))
(array([1, 2, 3], dtype=int32), array([6, 6, 6], dtype=int64))
# List the indicator columns (modalities) assigned to each group
for g in numpy.unique(groupes):
    members = M.columns[groupes == g]
    print(g, " : ", members)
1 : Index(['budget_y', 'physician_n', 'salvador_n', 'nicaraguan_y', 'missile_y', 'education_n'], dtype='object') 2 : Index(['budget_neither', 'physician_neither', 'salvador_neither', 'nicaraguan_neither', 'missile_neither', 'education_neither'], dtype='object') 3 : Index(['budget_n', 'physician_y', 'salvador_y', 'nicaraguan_n', 'missile_n', 'education_y'], dtype='object')
# 0/1 coding of the party affiliation: one indicator column per party.
# The dtype is pinned to int so the indicators stay numeric 0/1 under
# pandas >= 2.0, where get_dummies returns bool columns by default.
M_Aff = pandas.get_dummies(vote.affiliation, drop_first=False, dtype=int)
M_Aff.head()
 | democrat | republican |
---|---|---|
0 | 0 | 1 |
1 | 0 | 1 |
2 | 1 | 0 |
3 | 1 | 0 |
4 | 1 | 0 |
# Link between the democrat indicator and each group: mean squared Dice
# distance between that indicator and the columns of the group
democrat_flag = M_Aff.democrat
for g in numpy.unique(groupes):
    group_columns = M.loc[:, groupes == g]
    sq_distances = group_columns.apply(func=lambda x: funSqDice(x, democrat_flag), axis=0)
    print(g, " : ", numpy.mean(sq_distances))
1 : 33.5 2 : 130.91666666666666 3 : 186.58333333333334