Importation, inspection et préparation des données¶

In [37]:
# version de la librairie
import discrimintools
discrimintools.__version__
Out[37]:
'0.1.0'
In [38]:
# changement de dossier
import os
os.chdir("C:/Users/ricco/Desktop/demo")
In [39]:
# chargement des données
import pandas
df = pandas.read_csv("synthetic_health_dataset.csv",sep=",")
df.info()
<class 'pandas.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Age              1000 non-null   int64
 1   IMC              1000 non-null   int64
 2   PA_systolique    1000 non-null   int64
 3   Cholesterol      1000 non-null   int64
 4   Glycemie_jeun    1000 non-null   int64
 5   Freq_card_repos  1000 non-null   int64
 6   Taille_cm        1000 non-null   int64
 7   Score_bien_etre  1000 non-null   int64
 8   Sexe             1000 non-null   str  
 9   Tabagisme        1000 non-null   str  
 10  Act_physique     1000 non-null   str  
 11  Antec_familial   1000 non-null   str  
 12  Grp_sanguin      1000 non-null   str  
 13  Region_resid     1000 non-null   str  
 14  Risque           1000 non-null   str  
dtypes: int64(8), str(7)
memory usage: 117.3 KB
In [40]:
# premières valeurs
df.head()
Out[40]:
Age IMC PA_systolique Cholesterol Glycemie_jeun Freq_card_repos Taille_cm Score_bien_etre Sexe Tabagisme Act_physique Antec_familial Grp_sanguin Region_resid Risque
0 39 19 118 179 89 84 178 72 Femme Non elevee Non A Nord 1Faible
1 61 31 135 310 119 72 166 74 Homme Ancien Faible Oui B Ouest 3Eleve
2 49 29 121 220 95 85 170 67 Homme Non Moderee Non A Sud 2Modere
3 33 29 133 230 108 75 176 62 Homme Non Faible Oui A Sud 2Modere
4 56 21 111 192 98 71 169 61 Femme Non Moderee Non A Nord 1Faible
In [41]:
# distribution des classes
df.Risque.value_counts(normalize=True)
Out[41]:
Risque
1Faible    0.421
2Modere    0.336
3Eleve     0.243
Name: proportion, dtype: float64
In [42]:
# partition train-test
from sklearn.model_selection import train_test_split
dfTrain, dfTest = train_test_split(df,train_size=0.7,stratify=df.Risque,random_state=42)

# distributions
print(dfTrain.Risque.value_counts(normalize=True))
print(dfTest.Risque.value_counts(normalize=True))
Risque
1Faible    0.421429
2Modere    0.335714
3Eleve     0.242857
Name: proportion, dtype: float64
Risque
1Faible    0.420000
2Modere    0.336667
3Eleve     0.243333
Name: proportion, dtype: float64

ADL - Variables quantitatives - DISCRIM¶

Modélisation - Inspection¶

In [43]:
# liste des variables numériques
var_quanti = df.select_dtypes(include="number").columns
print(var_quanti)
Index(['Age', 'IMC', 'PA_systolique', 'Cholesterol', 'Glycemie_jeun',
       'Freq_card_repos', 'Taille_cm', 'Score_bien_etre'],
      dtype='str')
In [44]:
# Analyse discriminante avec DISCRIM
from discrimintools import DISCRIM, summaryDISCRIM
adl = DISCRIM()

# entraînement
adl.fit(dfTrain[var_quanti],dfTrain.Risque)

# infos
summaryDISCRIM(adl)
                     Discriminant Analysis - Results                     

Summary Information:
               Infos  Value                  DF  DF value
0  Total Sample Size    700            DF Total       699
1          Variables      8   DF Within Classes       697
2            Classes      3  DF Between Classes         2

Class Level Information:
         Frequency  Proportion  Prior Probability
1Faible        295      0.4214             0.4214
2Modere        235      0.3357             0.3357
3Eleve         170      0.2429             0.2429

Linear Discriminant Function for Risque:
                  1Faible   2Modere    3Eleve
Constant        -587.7005 -697.4996 -795.8079
Age                0.8575    1.1320    1.3574
IMC                3.4751    4.3236    4.8782
PA_systolique      1.2590    1.5127    1.7078
Cholesterol        0.6333    0.7599    0.8645
Glycemie_jeun      0.6123    0.7550    0.9370
Freq_card_repos    1.3983    1.4364    1.4723
Taille_cm          3.7012    3.7216    3.6786
Score_bien_etre    0.3197    0.3282    0.3361
In [45]:
# coefficients
adl.coef_
Out[45]:
1Faible 2Modere 3Eleve
Constant -587.700542 -697.499618 -795.807887
Age 0.857454 1.131961 1.357385
IMC 3.475061 4.323571 4.878212
PA_systolique 1.259028 1.512716 1.707794
Cholesterol 0.633308 0.759902 0.864484
Glycemie_jeun 0.612254 0.755003 0.936986
Freq_card_repos 1.398289 1.436436 1.472289
Taille_cm 3.701172 3.721553 3.678583
Score_bien_etre 0.319716 0.328217 0.336118
In [46]:
# statistiques - test MANOVA
adl.statistics_.manova
Out[46]:
Value Num DF Den DF F Value Pr > F
Statistic
Wilks' lambda 0.074122 16 1380.0 230.550716 0.0
Pillai's trace 0.974828 16.0 1382.0 82.133364 0.0
Hotelling-Lawley trace 11.830908 16 1125.509623 509.634654 0.0
Roy's greatest root 11.774822 8 691 1017.050266 0.0
In [47]:
# VIP - contribution des variables dans le modèle
adl.vip_.vip
Out[47]:
Wilks' Lambda Partial R-Square F Value Num DF Den DF Pr>F
Age 0.088568 0.836895 67.237875 2 690 2.096841e-27
IMC 0.090120 0.822477 74.464671 2 690 5.220100e-30
PA_systolique 0.095952 0.772485 101.610623 2 690 2.099258e-39
Cholesterol 0.097300 0.761789 107.881170 2 690 1.710082e-41
Glycemie_jeun 0.092730 0.799332 86.610538 2 690 2.759524e-34
Freq_card_repos 0.074428 0.995892 1.423073 2 690 2.416787e-01
Taille_cm 0.074862 0.990116 3.444005 2 690 3.248659e-02
Score_bien_etre 0.074183 0.999181 0.282785 2 690 7.537693e-01
In [48]:
# test Box's M - homoscedasticite
adl.cov_.test
Out[48]:
Bartlett Value Num DF Den DF F value Pr>F Chi Sq. Value Pr>Chi2
Box's M 354.906797 72 956912 4.845104 6.252166e-38 348.874194 6.061101e-38

Evaluation en test¶

In [49]:
# numpy
import numpy as np

# prediction
pred = adl.predict(dfTest[var_quanti])

# distribution des classes prédictes
np.unique(pred,return_counts=True)
Out[49]:
(array(['1Faible', '2Modere', '3Eleve'], dtype=object), array([127, 103,  70]))
In [50]:
# matrice de confusion
from sklearn import metrics
cm = metrics.confusion_matrix(dfTest.Risque,pred)
print(cm)
[[126   0   0]
 [  1  94   6]
 [  0   9  64]]
In [51]:
# accuracy score
metrics.accuracy_score(dfTest.Risque,pred)
Out[51]:
0.9466666666666667

/!\ Petite digression sur la caractère ordinal de la variable cible. Indicateur d'évaluation du classement.

In [52]:
# mesure d'évaluation tenant compte du caractère ordinal de la classe
# même si la modélisation n'en a pas tenu compte
# d de Sommers = (proba de concordance - proba de discordance) de Y observé
# au regard de X (la prédiction pour nous) -> +1 association ordinale parfaite
from scipy.stats import somersd
print(somersd(pred,dfTest.Risque))
SomersDResult(statistic=np.float64(0.953805558411295), pvalue=np.float64(0.0), table=array([[126,   1,   0],
       [  0,  94,   9],
       [  0,   6,  64]]))
In [53]:
# ou bien à partir de la matrice confusion <<cm>> ci-dessus
somersd(cm)
Out[53]:
SomersDResult(statistic=np.float64(0.9500290132095437), pvalue=np.float64(0.0), table=array([[126,   0,   0],
       [  1,  94,   6],
       [  0,   9,  64]]))

Sélection de variables - STEPDISC¶

In [54]:
# liste initialie des variables
print(var_quanti)
Index(['Age', 'IMC', 'PA_systolique', 'Cholesterol', 'Glycemie_jeun',
       'Freq_card_repos', 'Taille_cm', 'Score_bien_etre'],
      dtype='str')
In [55]:
# classe STEPDISC
from discrimintools import STEPDISC
step = STEPDISC(method='forward',alpha=0.01)

# processus de sélection - reprise de l'objet ADL ci-dessus
step.fit(adl)
====================== Step 1 forward selection results =======================
                 Wilks' Lambda  Partial R-Square     F Value  Num DF  Den DF  \
Age                   0.338816          0.661184  680.080974       2     697   
IMC                   0.325498          0.674502  722.166709       2     697   
PA_systolique         0.313098          0.686902  764.568497       2     697   
Cholesterol           0.271494          0.728506  935.139632       2     697   
Glycemie_jeun         0.304196          0.695804  797.141188       2     697   
Freq_card_repos       0.999598          0.000402    0.140050       2     697   
Taille_cm             0.988231          0.011769    4.150217       2     697   
Score_bien_etre       0.998860          0.001140    0.397780       2     697   

                          Pr>F  
Age              1.557963e-164  
IMC              1.327842e-170  
PA_systolique    1.757076e-176  
Cholesterol      4.626402e-198  
Glycemie_jeun    7.572269e-181  
Freq_card_repos   8.693391e-01  
Taille_cm         1.615220e-02  
Score_bien_etre   6.719624e-01  

Variable Cholesterol will enter


====================== Step 2 forward selection results =======================
                 Wilks' Lambda  Partial R-Square     F Value  Num DF  Den DF  \
Age                   0.181183          0.332644  173.460712       2     696   
IMC                   0.163714          0.396989  229.103636       2     696   
PA_systolique         0.164229          0.395092  227.294150       2     696   
Glycemie_jeun         0.167156          0.384308  217.217620       2     696   
Freq_card_repos       0.270110          0.005098    1.783249       2     696   
Taille_cm             0.267603          0.014331    5.059560       2     696   
Score_bien_etre       0.271335          0.000584    0.203329       2     696   

                         Pr>F  
Age              7.524236e-62  
IMC              3.574793e-77  
PA_systolique    1.066281e-76  
Glycemie_jeun    4.994778e-74  
Freq_card_repos  1.688582e-01  
Taille_cm        6.583869e-03  
Score_bien_etre  8.160579e-01  

Variable IMC will enter


====================== Step 3 forward selection results =======================
                 Wilks' Lambda  Partial R-Square     F Value  Num DF  Den DF  \
Age                   0.127671      2.201559e-01   98.101897       2     695   
PA_systolique         0.116789      2.866243e-01  139.620616       2     695   
Glycemie_jeun         0.119192      2.719499e-01  129.802321       2     695   
Freq_card_repos       0.162123      9.718508e-03    3.410325       2     695   
Taille_cm             0.161884      1.117721e-02    3.927984       2     695   
Score_bien_etre       0.163714      5.109420e-07    0.000178       2     695   

                         Pr>F  
Age              2.969634e-38  
PA_systolique    1.066864e-51  
Glycemie_jeun    1.261940e-48  
Freq_card_repos  3.358421e-02  
Taille_cm        2.012184e-02  
Score_bien_etre  9.998225e-01  

Variable PA_systolique will enter


====================== Step 4 forward selection results =======================
                 Wilks' Lambda  Partial R-Square     F Value  Num DF  Den DF  \
Age                   0.094579          0.190171   81.485608       2     694   
Glycemie_jeun         0.089727          0.231722  104.659268       2     694   
Freq_card_repos       0.116303          0.004163    1.450688       2     694   
Taille_cm             0.115545          0.010652    3.735938       2     694   
Score_bien_etre       0.116632          0.001351    0.469431       2     694   

                         Pr>F  
Age              1.630995e-32  
Glycemie_jeun    1.883373e-40  
Freq_card_repos  2.351190e-01  
Taille_cm        2.433184e-02  
Score_bien_etre  6.255564e-01  

Variable Glycemie_jeun will enter


====================== Step 5 forward selection results =======================
                 Wilks' Lambda  Partial R-Square    F Value  Num DF  Den DF  \
Age                   0.075227          0.161596  66.785454       2     693   
Freq_card_repos       0.089613          0.001272   0.441420       2     693   
Taille_cm             0.088845          0.009830   3.440046       2     693   
Score_bien_etre       0.089566          0.001788   0.620535       2     693   

                         Pr>F  
Age              2.995750e-27  
Freq_card_repos  6.433035e-01  
Taille_cm        3.261177e-02  
Score_bien_etre  5.379552e-01  

Variable Age will enter


====================== Step 6 forward selection results =======================
                 Wilks' Lambda  Partial R-Square   F Value  Num DF  Den DF  \
Freq_card_repos       0.074922          0.004060  1.410423       2     692   
Taille_cm             0.074485          0.009867  3.448168       2     692   
Score_bien_etre       0.075170          0.000754  0.261212       2     692   

                     Pr>F  
Freq_card_repos  0.244741  
Taille_cm        0.032351  
Score_bien_etre  0.770194  

No variable can enter

Out[55]:
STEPDISC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
method 'forward'
alpha 0.01
lambda_init None
verbose True
In [56]:
# résumé du processus
step.summary_.summary
Out[56]:
Wilks' Lambda Partial R-Square F Value Num DF Den DF Pr>F
Cholesterol 0.271494 0.728506 935.139632 2.0 697.0 4.626402e-198
IMC 0.163714 0.396989 229.103636 2.0 696.0 3.574793e-77
PA_systolique 0.116789 0.286624 139.620616 2.0 695.0 1.066864e-51
Glycemie_jeun 0.089727 0.231722 104.659268 2.0 694.0 1.883373e-40
Age 0.075227 0.161596 66.785454 2.0 693.0 2.995750e-27
In [57]:
# liste initialie des variables
print("AVANT : ", np.sort(var_quanti))

# vs. liste des variables sélectionnées
print("APRES : ", np.sort(step.summary_.selected))
AVANT :  ['Age' 'Cholesterol' 'Freq_card_repos' 'Glycemie_jeun' 'IMC'
 'PA_systolique' 'Score_bien_etre' 'Taille_cm']
APRES :  ['Age' 'Cholesterol' 'Glycemie_jeun' 'IMC' 'PA_systolique']
In [58]:
# sous forme de résumé
from discrimintools import summarySTEPDISC
summarySTEPDISC(step)
                     Stepwise Discriminant Analysis - Results                     

====================== Before forward selection  =======================

                     Discriminant Analysis - Results                     

Summary Information:
               Infos  Value                  DF  DF value
0  Total Sample Size    700            DF Total       699
1          Variables      8   DF Within Classes       697
2            Classes      3  DF Between Classes         2

Class Level Information:
         Frequency  Proportion  Prior Probability
1Faible        295      0.4214             0.4214
2Modere        235      0.3357             0.3357
3Eleve         170      0.2429             0.2429

Linear Discriminant Function for Risque:
                  1Faible   2Modere    3Eleve
Constant        -587.7005 -697.4996 -795.8079
Age                0.8575    1.1320    1.3574
IMC                3.4751    4.3236    4.8782
PA_systolique      1.2590    1.5127    1.7078
Cholesterol        0.6333    0.7599    0.8645
Glycemie_jeun      0.6123    0.7550    0.9370
Freq_card_repos    1.3983    1.4364    1.4723
Taille_cm          3.7012    3.7216    3.6786
Score_bien_etre    0.3197    0.3282    0.3361

====================== After forward selection  =======================

                     Discriminant Analysis - Results                     

Summary Information:
               Infos  Value                  DF  DF value
0  Total Sample Size    700            DF Total       699
1          Variables      5   DF Within Classes       697
2            Classes      3  DF Between Classes         2

Class Level Information:
         Frequency  Proportion  Prior Probability
1Faible        295      0.4214             0.4214
2Modere        235      0.3357             0.3357
3Eleve         170      0.2429             0.2429

Linear Discriminant Function for Risque:
                1Faible   2Modere    3Eleve
Constant      -205.7444 -308.2085 -409.9892
Cholesterol      0.5199    0.6448    0.7490
IMC              3.4180    4.2587    4.8011
PA_systolique    1.4120    1.6667    1.8602
Glycemie_jeun    0.6322    0.7758    0.9587
Age              0.7919    1.0639    1.2864
In [59]:
# prédiction avec le modèle réduit
predSel = step.predict(dfTest[var_quanti])
np.unique(predSel,return_counts=True)
Out[59]:
(array(['1Faible', '2Modere', '3Eleve'], dtype=object), array([126, 103,  71]))
In [60]:
# matrice de confusion
print("Matrice de confusion : \n",metrics.confusion_matrix(dfTest.Risque,predSel))

# accuracy
print("\nAccuracy : ", metrics.accuracy_score(dfTest.Risque,predSel))
Matrice de confusion : 
 [[126   0   0]
 [  0  94   7]
 [  0   9  64]]

Accuracy :  0.9466666666666667

Analyse factorielle discriminante¶

In [61]:
# candisc
from discrimintools import CANDISC, summaryCANDISC
afd = CANDISC()

# entraînement -> 2 composantes puisque 3 classes
afd.fit(df[var_quanti],df.Risque)

# summary
summaryCANDISC(afd)
                     Canonical Discriminant Analysis - Results                     

Summary Information:
               infos  Value                  DF  DF value
0  Total Sample Size   1000            DF Total       999
1          Variables      8   DF Within Classes       997
2            Classes      3  DF Between Classes         2

Class Level Information:
         Frequency  Proportion  Prior Probability
1Faible        421       0.421              0.421
2Modere        336       0.336              0.336
3Eleve         243       0.243              0.243

Total-Sample Class Means:
                  1Faible   2Modere    3Eleve
Age               37.9050   51.9286   63.0905
IMC               21.9952   27.3452   30.8601
PA_systolique    115.0594  134.9970  149.5103
Cholesterol      169.4917  210.9256  244.3539
Glycemie_jeun     85.0214  105.6458  129.3128
Freq_card_repos   75.4608   74.5298   75.3374
Taille_cm        171.5938  172.7143  171.3786
Score_bien_etre   65.0071   64.9256   64.7490

Importance of components:
      Eigenvalue  Difference  Proportion  Cumulative
Can1     10.8597     10.8189     99.6259     99.6259
Can2      0.0408         NaN      0.3741    100.0000

Raw Canonical and Classification Functions Coefficients:
                    Can1     Can2  1Faible  2Modere    3Eleve
Constant        -23.8137  10.0939  77.4005 -28.2988 -121.7512
Age               0.0554  -0.0058  -0.1948   0.0572    0.2584
IMC               0.1737  -0.1638  -0.6245   0.2194    0.7786
PA_systolique     0.0477  -0.0230  -0.1693   0.0542    0.2184
Cholesterol       0.0253  -0.0043  -0.0892   0.0266    0.1177
Glycemie_jeun     0.0386   0.0635  -0.1289   0.0211    0.1941
Freq_card_repos   0.0048   0.0274  -0.0139  -0.0028    0.0280
Taille_cm         0.0051  -0.0595  -0.0239   0.0216    0.0115
Score_bien_etre   0.0013  -0.0007  -0.0048   0.0015    0.0062
In [62]:
# distances de Mahalanobis entre les classes
afd.classes_.mahal
Out[62]:
1Faible 2Modere 3Eleve
1Faible 0.000000 20.434427 66.984893
2Modere 20.434427 0.000000 13.771719
3Eleve 66.984893 13.771719 0.000000
In [63]:
# coordonnées des individus
# dans l'espace fatcoriel
afd.ind_.coord.head()
Out[63]:
Can1 Can2
0 -3.344178 0.574921
1 5.131832 -0.180666
2 0.317565 -0.477204
3 0.734462 -0.505986
4 -1.833913 1.014346
In [64]:
# projection dans le plan avec les classes d'appartenance
import seaborn as sns
sns.scatterplot(afd.ind_.coord,x="Can1",y="Can2",hue=df.Risque,s=15)
Out[64]:
<Axes: xlabel='Can1', ylabel='Can2'>
No description has been provided for this image
In [65]:
# outil de visualisation - biplot
from discrimintools import fviz_candisc
fviz_candisc(afd,element='biplot',geom_ind='point',point_args_ind={'size':0.8})
Out[65]:
No description has been provided for this image

Analyse discriminante linéaire - Variables mixtes¶

In [66]:
# liste des variables : quanti, quali
dfTrain.columns
Out[66]:
Index(['Age', 'IMC', 'PA_systolique', 'Cholesterol', 'Glycemie_jeun',
       'Freq_card_repos', 'Taille_cm', 'Score_bien_etre', 'Sexe', 'Tabagisme',
       'Act_physique', 'Antec_familial', 'Grp_sanguin', 'Region_resid',
       'Risque'],
      dtype='str')
In [67]:
dfTrain.select_dtypes(include=['str','object']).head()
Out[67]:
Sexe Tabagisme Act_physique Antec_familial Grp_sanguin Region_resid Risque
778 Femme Actuel Moderee Oui B Sud 2Modere
956 Femme Ancien Moderee Oui B Sud 2Modere
711 Homme Ancien Faible Non O Est 2Modere
414 Homme Ancien Faible Oui B Est 2Modere
886 Femme Non Faible Oui AB Ouest 2Modere
In [68]:
# modalités des variables
_ = dfTrain. \
    select_dtypes(include=['str','object']). \
    apply(raw=True,func=lambda x: print(np.unique(x,return_counts=True)),axis=0)
(array(['Femme', 'Homme'], dtype=object), array([340, 360]))
(array(['Actuel', 'Ancien', 'Non'], dtype=object), array([199, 178, 323]))
(array(['Faible', 'Moderee', 'elevee'], dtype=object), array([222, 248, 230]))
(array(['Non', 'Oui'], dtype=object), array([377, 323]))
(array(['A', 'AB', 'B', 'O'], dtype=object), array([182, 163, 193, 162]))
(array(['Est', 'Nord', 'Ouest', 'Sud'], dtype=object), array([158, 192, 147, 203]))
(array(['1Faible', '2Modere', '3Eleve'], dtype=object), array([295, 235, 170]))
In [69]:
# modélisation avec l'ensemble des variables
adlMixed = DISCRIM()
adlMixed.fit(dfTrain.drop(columns=['Risque']),dfTrain.Risque)

# affichage des coefficients
adlMixed.coef_
Categorical features have been encoded into binary variables.

Out[69]:
1Faible 2Modere 3Eleve
Constant -613.857872 -725.317494 -828.076072
Age 0.905250 1.183821 1.414225
IMC 3.557204 4.409117 4.978252
PA_systolique 1.270664 1.522208 1.719463
Cholesterol 0.629839 0.758895 0.866641
Glycemie_jeun 0.674278 0.814519 0.997168
Freq_card_repos 1.438387 1.484948 1.528955
Taille_cm 3.786889 3.816004 3.789481
Score_bien_etre 0.307521 0.310589 0.319372
SexeHomme 1.404210 1.660624 1.600866
TabagismeAncien 2.743002 2.106216 -2.874828
TabagismeNon 3.087176 0.816402 -4.009861
Act_physiqueModeree 6.354146 6.741738 5.403188
Act_physiqueelevee 10.354135 8.937648 7.832321
Antec_familialOui 3.915938 5.826581 8.544247
Grp_sanguinAB 9.728814 10.335045 11.190349
Grp_sanguinB 12.647899 13.056246 13.569205
Grp_sanguinO 12.193647 12.352963 12.650449
Region_residNord 5.953027 6.313861 6.292362
Region_residOuest 6.499395 7.722298 8.317303
Region_residSud 8.104428 8.417343 8.427264
In [70]:
# évaluation en test
predMixed = adlMixed.predict(dfTest.drop(columns=['Risque']))
np.unique(predMixed,return_counts=True)
Out[70]:
(array(['1Faible', '2Modere', '3Eleve'], dtype=object), array([127,  99,  74]))
In [71]:
# matrice de confusion
print("Matrice de confusion : \n",metrics.confusion_matrix(dfTest.Risque,predMixed))

# accuracy
print("\nAccuracy : ", metrics.accuracy_score(dfTest.Risque,predMixed))
Matrice de confusion : 
 [[126   0   0]
 [  1  95   5]
 [  0   4  69]]

Accuracy :  0.9666666666666667