Importation et inspection des données¶

In [ ]:
#working directory: defaults to the original demo folder, but can be
#overridden with the JOBRATE_DIR environment variable so that the notebook
#also runs on machines where that Windows path does not exist
import os
data_dir = os.environ.get("JOBRATE_DIR", "C:/Users/ricco/Desktop/demo")
os.chdir(data_dir)

#load the data (the relative file name is resolved against the working
#directory set just above)
import pandas
jobrate = pandas.read_excel("jobrate.xlsx")
jobrate.head()
Out[ ]:
Communication_Skills Problem_Solving Learning_Ability Judgement_under_Pressure Observational_Skills Willingness_to_Confront_Problems Interest_in_People Interpersonal_Sensitivity Desire_for_Self_Improvement Appearance Dependability Physical_Ability Integrity Overall_Rating
0 2 6 8 3 8 8 5 3 8 7 9 8 6 7
1 7 4 7 5 8 8 7 6 8 5 7 6 6 7
2 5 6 7 5 7 8 6 3 7 7 5 8 7 5
3 6 7 8 6 9 7 7 7 9 8 8 9 9 7
4 9 9 9 9 7 7 9 8 8 7 8 8 8 8
In [ ]:
#summary of the data: column names, non-null counts and dtypes
jobrate.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 14 columns):
 #   Column                            Non-Null Count  Dtype
---  ------                            --------------  -----
 0   Communication_Skills              103 non-null    int64
 1   Problem_Solving                   103 non-null    int64
 2   Learning_Ability                  103 non-null    int64
 3   Judgement_under_Pressure          103 non-null    int64
 4   Observational_Skills              103 non-null    int64
 5   Willingness_to_Confront_Problems  103 non-null    int64
 6   Interest_in_People                103 non-null    int64
 7   Interpersonal_Sensitivity         103 non-null    int64
 8   Desire_for_Self_Improvement       103 non-null    int64
 9   Appearance                        103 non-null    int64
 10  Dependability                     103 non-null    int64
 11  Physical_Ability                  103 non-null    int64
 12  Integrity                         103 non-null    int64
 13  Overall_Rating                    103 non-null    int64
dtypes: int64(14)
memory usage: 11.4 KB
In [ ]:
#active variables: every column except the last one
#(the last column, Overall_Rating, is kept aside)
active_columns = jobrate.columns[:-1]
X = jobrate[active_columns]
print(X.columns)
Index(['Communication_Skills', 'Problem_Solving', 'Learning_Ability',
       'Judgement_under_Pressure', 'Observational_Skills',
       'Willingness_to_Confront_Problems', 'Interest_in_People',
       'Interpersonal_Sensitivity', 'Desire_for_Self_Improvement',
       'Appearance', 'Dependability', 'Physical_Ability', 'Integrity'],
      dtype='object')
In [ ]:
#pairwise correlation matrix of the active variables
#(Pearson is the default of DataFrame.corr; spelled out here)
R = X.corr(method='pearson')
print(R)
                                  Communication_Skills  Problem_Solving  \
Communication_Skills                          1.000000         0.628035   
Problem_Solving                               0.628035         1.000000   
Learning_Ability                              0.554550         0.569010   
Judgement_under_Pressure                      0.553767         0.619508   
Observational_Skills                          0.538073         0.428397   
Willingness_to_Confront_Problems              0.526495         0.501478   
Interest_in_People                            0.439102         0.397212   
Interpersonal_Sensitivity                     0.503019         0.439777   
Desire_for_Self_Improvement                   0.564244         0.409005   
Appearance                                    0.491275         0.387283   
Dependability                                 0.547073         0.454569   
Physical_Ability                              0.219193         0.320063   
Integrity                                     0.508065         0.384584   

                                  Learning_Ability  Judgement_under_Pressure  \
Communication_Skills                      0.554550                  0.553767   
Problem_Solving                           0.569010                  0.619508   
Learning_Ability                          1.000000                  0.489166   
Judgement_under_Pressure                  0.489166                  1.000000   
Observational_Skills                      0.622984                  0.373307   
Willingness_to_Confront_Problems          0.524514                  0.400366   
Interest_in_People                        0.273500                  0.622643   
Interpersonal_Sensitivity                 0.185492                  0.613361   
Desire_for_Self_Improvement               0.573672                  0.482604   
Appearance                                0.398800                  0.226605   
Dependability                             0.510967                  0.547136   
Physical_Ability                          0.226865                  0.347631   
Integrity                                 0.314219                  0.588322   

                                  Observational_Skills  \
Communication_Skills                          0.538073   
Problem_Solving                               0.428397   
Learning_Ability                              0.622984   
Judgement_under_Pressure                      0.373307   
Observational_Skills                          1.000000   
Willingness_to_Confront_Problems              0.729956   
Interest_in_People                            0.261634   
Interpersonal_Sensitivity                     0.165465   
Desire_for_Self_Improvement                   0.598463   
Appearance                                    0.417708   
Dependability                                 0.562557   
Physical_Ability                              0.427365   
Integrity                                     0.390568   

                                  Willingness_to_Confront_Problems  \
Communication_Skills                                      0.526495   
Problem_Solving                                           0.501478   
Learning_Ability                                          0.524514   
Judgement_under_Pressure                                  0.400366   
Observational_Skills                                      0.729956   
Willingness_to_Confront_Problems                          1.000000   
Interest_in_People                                        0.223305   
Interpersonal_Sensitivity                                 0.129088   
Desire_for_Self_Improvement                               0.530721   
Appearance                                                0.482461   
Dependability                                             0.487022   
Physical_Ability                                          0.487234   
Integrity                                                 0.326041   

                                  Interest_in_People  \
Communication_Skills                        0.439102   
Problem_Solving                             0.397212   
Learning_Ability                            0.273500   
Judgement_under_Pressure                    0.622643   
Observational_Skills                        0.261634   
Willingness_to_Confront_Problems            0.223305   
Interest_in_People                          1.000000   
Interpersonal_Sensitivity                   0.805146   
Desire_for_Self_Improvement                 0.485749   
Appearance                                  0.267870   
Dependability                               0.607408   
Physical_Ability                            0.376810   
Integrity                                   0.745175   

                                  Interpersonal_Sensitivity  \
Communication_Skills                               0.503019   
Problem_Solving                                    0.439777   
Learning_Ability                                   0.185492   
Judgement_under_Pressure                           0.613361   
Observational_Skills                               0.165465   
Willingness_to_Confront_Problems                   0.129088   
Interest_in_People                                 0.805146   
Interpersonal_Sensitivity                          1.000000   
Desire_for_Self_Improvement                        0.371262   
Appearance                                         0.260027   
Dependability                                      0.540765   
Physical_Ability                                   0.218213   
Integrity                                          0.691997   

                                  Desire_for_Self_Improvement  Appearance  \
Communication_Skills                                 0.564244    0.491275   
Problem_Solving                                      0.409005    0.387283   
Learning_Ability                                     0.573672    0.398800   
Judgement_under_Pressure                             0.482604    0.226605   
Observational_Skills                                 0.598463    0.417708   
Willingness_to_Confront_Problems                     0.530721    0.482461   
Interest_in_People                                   0.485749    0.267870   
Interpersonal_Sensitivity                            0.371262    0.260027   
Desire_for_Self_Improvement                          1.000000    0.447442   
Appearance                                           0.447442    1.000000   
Dependability                                        0.598126    0.508938   
Physical_Ability                                     0.375248    0.382005   
Integrity                                            0.566381    0.413541   

                                  Dependability  Physical_Ability  Integrity  
Communication_Skills                   0.547073          0.219193   0.508065  
Problem_Solving                        0.454569          0.320063   0.384584  
Learning_Ability                       0.510967          0.226865   0.314219  
Judgement_under_Pressure               0.547136          0.347631   0.588322  
Observational_Skills                   0.562557          0.427365   0.390568  
Willingness_to_Confront_Problems       0.487022          0.487234   0.326041  
Interest_in_People                     0.607408          0.376810   0.745175  
Interpersonal_Sensitivity              0.540765          0.218213   0.691997  
Desire_for_Self_Improvement            0.598126          0.375248   0.566381  
Appearance                             0.508938          0.382005   0.413541  
Dependability                          1.000000          0.446120   0.653645  
Physical_Ability                       0.446120          1.000000   0.380959  
Integrity                              0.653645          0.380959   1.000000  
In [ ]:
#graphical view of the correlation matrix as a heatmap,
#with the colour scale pinned to the full [-1, +1] range
import seaborn as sns
sns.heatmap(data=R, cmap='Blues', vmin=-1, vmax=1)
Out[ ]:
<AxesSubplot: >
In [ ]:
#same heatmap on the squared correlations;
#squares are non-negative, so the colour scale now starts at 0
sns.heatmap(data=R.pow(2), cmap='Blues', vmin=0, vmax=1)
Out[ ]:
<AxesSubplot: >

CAH sur les variables¶

In [ ]:
import numpy
import pandas

#dissimilarity between variables: d = sqrt(1 - r^2), so two variables are
#close when they are strongly correlated, whatever the sign of r
#floating-point rounding in the correlation matrix can leave r^2 a hair
#above 1; clip at 0 so the square root never produces NaN
dist = numpy.sqrt(numpy.clip(1 - R.to_numpy()**2, 0, None))
#a variable is at distance exactly 0 from itself; force it so that rounding
#noise on the diagonal cannot make scipy's squareform() reject the matrix
numpy.fill_diagonal(dist, 0.0)
#keep the variable names on both axes, as in R
D = pandas.DataFrame(dist, index=R.index, columns=R.columns)
print(D.iloc[:3,:3])
                      Communication_Skills  Problem_Solving  Learning_Ability
Communication_Skills              0.000000         0.778185           0.83215
Problem_Solving                   0.778185         0.000000           0.82233
Learning_Ability                  0.832150         0.822330           0.00000
In [ ]:
#preparation for scipy's hierarchical clustering:
#flatten the square distance matrix into its condensed (vector) form
from scipy.spatial.distance import squareform
VD = squareform(D, force='tovector')
print(VD)
[0.77818523 0.83215007 0.83267178 0.84289804 0.85017808 0.89843705
 0.86427518 0.82560814 0.87100447 0.83708488 0.97568158 0.8613188
 0.82233043 0.78499017 0.90359063 0.86517037 0.91772702 0.89810686
 0.91253221 0.92196106 0.89071177 0.94739634 0.92309005 0.87219097
 0.78223456 0.85140174 0.96187211 0.98264578 0.8190852  0.91703795
 0.85960042 0.97392625 0.94935047 0.92770778 0.91635538 0.78250602
 0.78980271 0.87583841 0.97398669 0.83704371 0.93763126 0.80862658
 0.68349431 0.96516709 0.9862157  0.80115068 0.90858111 0.82675831
 0.90407921 0.92057396 0.97474855 0.99163318 0.84754664 0.87591723
 0.87338953 0.87327121 0.94535553 0.59307603 0.87409835 0.96345514
 0.79439041 0.92629042 0.66686892 0.92852812 0.9656013  0.84117373
 0.97590127 0.7219     0.89431316 0.80140227 0.92692439 0.82414385
 0.86080325 0.92416038 0.91048572 0.8949733  0.75680154 0.9245921 ]
In [ ]:
#hierarchical agglomerative clustering of the variables with Ward's linkage,
#fed with the condensed distance vector VD
#NOTE(review): SciPy documents Ward's method as assuming Euclidean distances,
#whereas VD is derived from correlations - confirm this use is intended
from scipy.cluster.hierarchy import ward
cah = ward(VD)
#one row per merge: ids of the two merged clusters, merge height, size of the new cluster
print(cah)
[[ 6.          7.          0.59307603  2.        ]
 [ 4.          5.          0.68349431  2.        ]
 [12.         13.          0.72571077  3.        ]
 [ 0.          1.          0.77818523  2.        ]
 [ 8.         10.          0.80140227  2.        ]
 [ 3.         16.          0.81925402  3.        ]
 [ 2.         17.          0.85193531  3.        ]
 [14.         19.          0.91093491  5.        ]
 [ 9.         11.          0.92416038  2.        ]
 [20.         21.          0.98022744  7.        ]
 [18.         22.          1.03939101 10.        ]
 [15.         23.          1.23922901 13.        ]]
In [ ]:
#dendrogram of the clustering, leaves labelled with the variable names;
#color_threshold=0 means no branch lies below the colouring threshold
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
dendrogram(cah, orientation='left', labels=X.columns, color_threshold=0)
plt.title("CAH")
plt.show()
In [ ]:
#same dendrogram with the colouring threshold raised to 0.95,
#which makes the 4 classes stand out
dendrogram(cah, orientation='left', labels=X.columns, color_threshold=0.95)
plt.title("CAH - 4 classes")
plt.show()
In [ ]:
#actual cut of the tree into 4 classes:
#flat clusters are formed by cutting at height 0.95
from scipy.cluster.hierarchy import fcluster
groupes = fcluster(Z=cah, t=0.95, criterion='distance')
print(groupes)
[2 2 3 2 3 3 1 1 3 4 3 4 1]
In [ ]:
#number of variables in each class (labels first, then counts)
labels_and_counts = numpy.unique(groupes, return_counts=True)
print(labels_and_counts)
(array([1, 2, 3, 4], dtype=int32), array([3, 3, 5, 2], dtype=int64))
In [ ]:
#names of the variables assigned to class 1
in_group_1 = groupes == 1
print(X.columns[in_group_1])
Index(['Interest_in_People', 'Interpersonal_Sensitivity', 'Integrity'], dtype='object')
In [ ]:
#list the member variables of every class in turn
for g in numpy.unique(groupes):
    members = X.columns[groupes == g]
    print(g," : ",members)
1  :  Index(['Interest_in_People', 'Interpersonal_Sensitivity', 'Integrity'], dtype='object')
2  :  Index(['Communication_Skills', 'Problem_Solving', 'Judgement_under_Pressure'], dtype='object')
3  :  Index(['Learning_Ability', 'Observational_Skills',
       'Willingness_to_Confront_Problems', 'Desire_for_Self_Improvement',
       'Dependability'],
      dtype='object')
4  :  Index(['Appearance', 'Physical_Ability'], dtype='object')

Traitement de la variable supplémentaire (overall)¶

In [ ]:
#correlation of each active variable with the overall rating
overall = jobrate["Overall_Rating"]
print(X.corrwith(overall))
Communication_Skills                0.680250
Problem_Solving                     0.577254
Learning_Ability                    0.592385
Judgement_under_Pressure            0.655048
Observational_Skills                0.584425
Willingness_to_Confront_Problems    0.589032
Interest_in_People                  0.613343
Interpersonal_Sensitivity           0.577246
Desire_for_Self_Improvement         0.667191
Appearance                          0.568722
Dependability                       0.769633
Physical_Ability                    0.438495
Integrity                           0.671647
dtype: float64
In [ ]:
#data restricted to the variables of class 1 (first rows only)
X.loc[:, groupes == 1].head()
Out[ ]:
Interest_in_People Interpersonal_Sensitivity Integrity
0 5 3 6
1 7 6 6
2 6 3 7
3 7 7 9
4 9 8 8
In [ ]:
#correlations of the class 1 variables with "overall"
group1_data = X.loc[:, groupes == 1]
print(group1_data.corrwith(jobrate["Overall_Rating"]))
Interest_in_People           0.613343
Interpersonal_Sensitivity    0.577246
Integrity                    0.671647
dtype: float64
In [ ]:
#for every class: mean of the squared correlations between its variables
#and the overall rating
for g in numpy.unique(groupes):
    corr_with_overall = X.loc[:, groupes == g].corrwith(jobrate["Overall_Rating"])
    print(g," : ",numpy.mean(corr_with_overall**2))
1  :  0.38683756104936434
2  :  0.40834988274785666
3  :  0.4153820426907955
4  :  0.25786162385199723