Importation, inspection et préparation des données¶
In [37]:
# version de la librairie
import discrimintools
discrimintools.__version__
Out[37]:
'0.1.0'
In [38]:
# changement de dossier
import os
os.chdir("C:/Users/ricco/Desktop/demo")
In [39]:
# chargement des données
import pandas
df = pandas.read_csv("synthetic_health_dataset.csv",sep=",")
df.info()
<class 'pandas.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 1000 non-null int64 1 IMC 1000 non-null int64 2 PA_systolique 1000 non-null int64 3 Cholesterol 1000 non-null int64 4 Glycemie_jeun 1000 non-null int64 5 Freq_card_repos 1000 non-null int64 6 Taille_cm 1000 non-null int64 7 Score_bien_etre 1000 non-null int64 8 Sexe 1000 non-null str 9 Tabagisme 1000 non-null str 10 Act_physique 1000 non-null str 11 Antec_familial 1000 non-null str 12 Grp_sanguin 1000 non-null str 13 Region_resid 1000 non-null str 14 Risque 1000 non-null str dtypes: int64(8), str(7) memory usage: 117.3 KB
In [40]:
# premières valeurs
df.head()
Out[40]:
| Age | IMC | PA_systolique | Cholesterol | Glycemie_jeun | Freq_card_repos | Taille_cm | Score_bien_etre | Sexe | Tabagisme | Act_physique | Antec_familial | Grp_sanguin | Region_resid | Risque | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | 19 | 118 | 179 | 89 | 84 | 178 | 72 | Femme | Non | elevee | Non | A | Nord | 1Faible |
| 1 | 61 | 31 | 135 | 310 | 119 | 72 | 166 | 74 | Homme | Ancien | Faible | Oui | B | Ouest | 3Eleve |
| 2 | 49 | 29 | 121 | 220 | 95 | 85 | 170 | 67 | Homme | Non | Moderee | Non | A | Sud | 2Modere |
| 3 | 33 | 29 | 133 | 230 | 108 | 75 | 176 | 62 | Homme | Non | Faible | Oui | A | Sud | 2Modere |
| 4 | 56 | 21 | 111 | 192 | 98 | 71 | 169 | 61 | Femme | Non | Moderee | Non | A | Nord | 1Faible |
In [41]:
# distribution des classes
df.Risque.value_counts(normalize=True)
Out[41]:
Risque 1Faible 0.421 2Modere 0.336 3Eleve 0.243 Name: proportion, dtype: float64
In [42]:
# partition train-test
from sklearn.model_selection import train_test_split
dfTrain, dfTest = train_test_split(df,train_size=0.7,stratify=df.Risque,random_state=42)
# distributions
print(dfTrain.Risque.value_counts(normalize=True))
print(dfTest.Risque.value_counts(normalize=True))
Risque 1Faible 0.421429 2Modere 0.335714 3Eleve 0.242857 Name: proportion, dtype: float64 Risque 1Faible 0.420000 2Modere 0.336667 3Eleve 0.243333 Name: proportion, dtype: float64
ADL - Variables quantitatives - DISCRIM¶
Modélisation - Inspection¶
In [43]:
# liste des variables numériques
var_quanti = df.select_dtypes(include="number").columns
print(var_quanti)
Index(['Age', 'IMC', 'PA_systolique', 'Cholesterol', 'Glycemie_jeun',
'Freq_card_repos', 'Taille_cm', 'Score_bien_etre'],
dtype='str')
In [44]:
# Analyse discriminante avec DISCRIM
from discrimintools import DISCRIM, summaryDISCRIM
adl = DISCRIM()
# entraînement
adl.fit(dfTrain[var_quanti],dfTrain.Risque)
# infos
summaryDISCRIM(adl)
Discriminant Analysis - Results
Summary Information:
Infos Value DF DF value
0 Total Sample Size 700 DF Total 699
1 Variables 8 DF Within Classes 697
2 Classes 3 DF Between Classes 2
Class Level Information:
Frequency Proportion Prior Probability
1Faible 295 0.4214 0.4214
2Modere 235 0.3357 0.3357
3Eleve 170 0.2429 0.2429
Linear Discriminant Function for Risque:
1Faible 2Modere 3Eleve
Constant -587.7005 -697.4996 -795.8079
Age 0.8575 1.1320 1.3574
IMC 3.4751 4.3236 4.8782
PA_systolique 1.2590 1.5127 1.7078
Cholesterol 0.6333 0.7599 0.8645
Glycemie_jeun 0.6123 0.7550 0.9370
Freq_card_repos 1.3983 1.4364 1.4723
Taille_cm 3.7012 3.7216 3.6786
Score_bien_etre 0.3197 0.3282 0.3361
In [45]:
# coefficients
adl.coef_
Out[45]:
| 1Faible | 2Modere | 3Eleve | |
|---|---|---|---|
| Constant | -587.700542 | -697.499618 | -795.807887 |
| Age | 0.857454 | 1.131961 | 1.357385 |
| IMC | 3.475061 | 4.323571 | 4.878212 |
| PA_systolique | 1.259028 | 1.512716 | 1.707794 |
| Cholesterol | 0.633308 | 0.759902 | 0.864484 |
| Glycemie_jeun | 0.612254 | 0.755003 | 0.936986 |
| Freq_card_repos | 1.398289 | 1.436436 | 1.472289 |
| Taille_cm | 3.701172 | 3.721553 | 3.678583 |
| Score_bien_etre | 0.319716 | 0.328217 | 0.336118 |
In [46]:
# statistiques - test MANOVA
adl.statistics_.manova
Out[46]:
| Value | Num DF | Den DF | F Value | Pr > F | |
|---|---|---|---|---|---|
| Statistic | |||||
| Wilks' lambda | 0.074122 | 16 | 1380.0 | 230.550716 | 0.0 |
| Pillai's trace | 0.974828 | 16.0 | 1382.0 | 82.133364 | 0.0 |
| Hotelling-Lawley trace | 11.830908 | 16 | 1125.509623 | 509.634654 | 0.0 |
| Roy's greatest root | 11.774822 | 8 | 691 | 1017.050266 | 0.0 |
In [47]:
# VIP - contribution des variables dans le modèle
adl.vip_.vip
Out[47]:
| Wilks' Lambda | Partial R-Square | F Value | Num DF | Den DF | Pr>F | |
|---|---|---|---|---|---|---|
| Age | 0.088568 | 0.836895 | 67.237875 | 2 | 690 | 2.096841e-27 |
| IMC | 0.090120 | 0.822477 | 74.464671 | 2 | 690 | 5.220100e-30 |
| PA_systolique | 0.095952 | 0.772485 | 101.610623 | 2 | 690 | 2.099258e-39 |
| Cholesterol | 0.097300 | 0.761789 | 107.881170 | 2 | 690 | 1.710082e-41 |
| Glycemie_jeun | 0.092730 | 0.799332 | 86.610538 | 2 | 690 | 2.759524e-34 |
| Freq_card_repos | 0.074428 | 0.995892 | 1.423073 | 2 | 690 | 2.416787e-01 |
| Taille_cm | 0.074862 | 0.990116 | 3.444005 | 2 | 690 | 3.248659e-02 |
| Score_bien_etre | 0.074183 | 0.999181 | 0.282785 | 2 | 690 | 7.537693e-01 |
In [48]:
# test Box's M - homoscedasticite
adl.cov_.test
Out[48]:
| Bartlett Value | Num DF | Den DF | F value | Pr>F | Chi Sq. Value | Pr>Chi2 | |
|---|---|---|---|---|---|---|---|
| Box's M | 354.906797 | 72 | 956912 | 4.845104 | 6.252166e-38 | 348.874194 | 6.061101e-38 |
Evaluation en test¶
In [49]:
# numpy
import numpy as np
# prediction
pred = adl.predict(dfTest[var_quanti])
# distribution des classes prédictes
np.unique(pred,return_counts=True)
Out[49]:
(array(['1Faible', '2Modere', '3Eleve'], dtype=object), array([127, 103, 70]))
In [50]:
# matrice de confusion
from sklearn import metrics
cm = metrics.confusion_matrix(dfTest.Risque,pred)
print(cm)
[[126 0 0] [ 1 94 6] [ 0 9 64]]
In [51]:
# accuracy score
metrics.accuracy_score(dfTest.Risque,pred)
Out[51]:
0.9466666666666667
/!\ Petite digression sur la caractère ordinal de la variable cible. Indicateur d'évaluation du classement.
In [52]:
# mesure d'évaluation tenant compte du caractère ordinal de la classe
# même si la modélisation n'en a pas tenu compte
# d de Sommers = (proba de concordance - proba de discordance) de Y observé
# au regard de X (la prédiction pour nous) -> +1 association ordinale parfaite
from scipy.stats import somersd
print(somersd(pred,dfTest.Risque))
SomersDResult(statistic=np.float64(0.953805558411295), pvalue=np.float64(0.0), table=array([[126, 1, 0],
[ 0, 94, 9],
[ 0, 6, 64]]))
In [53]:
# ou bien à partir de la matrice confusion <<cm>> ci-dessus
somersd(cm)
Out[53]:
SomersDResult(statistic=np.float64(0.9500290132095437), pvalue=np.float64(0.0), table=array([[126, 0, 0],
[ 1, 94, 6],
[ 0, 9, 64]]))
Sélection de variables - STEPDISC¶
In [54]:
# liste initialie des variables
print(var_quanti)
Index(['Age', 'IMC', 'PA_systolique', 'Cholesterol', 'Glycemie_jeun',
'Freq_card_repos', 'Taille_cm', 'Score_bien_etre'],
dtype='str')
In [55]:
# classe STEPDISC
from discrimintools import STEPDISC
step = STEPDISC(method='forward',alpha=0.01)
# processus de sélection - reprise de l'objet ADL ci-dessus
step.fit(adl)
====================== Step 1 forward selection results =======================
Wilks' Lambda Partial R-Square F Value Num DF Den DF \
Age 0.338816 0.661184 680.080974 2 697
IMC 0.325498 0.674502 722.166709 2 697
PA_systolique 0.313098 0.686902 764.568497 2 697
Cholesterol 0.271494 0.728506 935.139632 2 697
Glycemie_jeun 0.304196 0.695804 797.141188 2 697
Freq_card_repos 0.999598 0.000402 0.140050 2 697
Taille_cm 0.988231 0.011769 4.150217 2 697
Score_bien_etre 0.998860 0.001140 0.397780 2 697
Pr>F
Age 1.557963e-164
IMC 1.327842e-170
PA_systolique 1.757076e-176
Cholesterol 4.626402e-198
Glycemie_jeun 7.572269e-181
Freq_card_repos 8.693391e-01
Taille_cm 1.615220e-02
Score_bien_etre 6.719624e-01
Variable Cholesterol will enter
====================== Step 2 forward selection results =======================
Wilks' Lambda Partial R-Square F Value Num DF Den DF \
Age 0.181183 0.332644 173.460712 2 696
IMC 0.163714 0.396989 229.103636 2 696
PA_systolique 0.164229 0.395092 227.294150 2 696
Glycemie_jeun 0.167156 0.384308 217.217620 2 696
Freq_card_repos 0.270110 0.005098 1.783249 2 696
Taille_cm 0.267603 0.014331 5.059560 2 696
Score_bien_etre 0.271335 0.000584 0.203329 2 696
Pr>F
Age 7.524236e-62
IMC 3.574793e-77
PA_systolique 1.066281e-76
Glycemie_jeun 4.994778e-74
Freq_card_repos 1.688582e-01
Taille_cm 6.583869e-03
Score_bien_etre 8.160579e-01
Variable IMC will enter
====================== Step 3 forward selection results =======================
Wilks' Lambda Partial R-Square F Value Num DF Den DF \
Age 0.127671 2.201559e-01 98.101897 2 695
PA_systolique 0.116789 2.866243e-01 139.620616 2 695
Glycemie_jeun 0.119192 2.719499e-01 129.802321 2 695
Freq_card_repos 0.162123 9.718508e-03 3.410325 2 695
Taille_cm 0.161884 1.117721e-02 3.927984 2 695
Score_bien_etre 0.163714 5.109420e-07 0.000178 2 695
Pr>F
Age 2.969634e-38
PA_systolique 1.066864e-51
Glycemie_jeun 1.261940e-48
Freq_card_repos 3.358421e-02
Taille_cm 2.012184e-02
Score_bien_etre 9.998225e-01
Variable PA_systolique will enter
====================== Step 4 forward selection results =======================
Wilks' Lambda Partial R-Square F Value Num DF Den DF \
Age 0.094579 0.190171 81.485608 2 694
Glycemie_jeun 0.089727 0.231722 104.659268 2 694
Freq_card_repos 0.116303 0.004163 1.450688 2 694
Taille_cm 0.115545 0.010652 3.735938 2 694
Score_bien_etre 0.116632 0.001351 0.469431 2 694
Pr>F
Age 1.630995e-32
Glycemie_jeun 1.883373e-40
Freq_card_repos 2.351190e-01
Taille_cm 2.433184e-02
Score_bien_etre 6.255564e-01
Variable Glycemie_jeun will enter
====================== Step 5 forward selection results =======================
Wilks' Lambda Partial R-Square F Value Num DF Den DF \
Age 0.075227 0.161596 66.785454 2 693
Freq_card_repos 0.089613 0.001272 0.441420 2 693
Taille_cm 0.088845 0.009830 3.440046 2 693
Score_bien_etre 0.089566 0.001788 0.620535 2 693
Pr>F
Age 2.995750e-27
Freq_card_repos 6.433035e-01
Taille_cm 3.261177e-02
Score_bien_etre 5.379552e-01
Variable Age will enter
====================== Step 6 forward selection results =======================
Wilks' Lambda Partial R-Square F Value Num DF Den DF \
Freq_card_repos 0.074922 0.004060 1.410423 2 692
Taille_cm 0.074485 0.009867 3.448168 2 692
Score_bien_etre 0.075170 0.000754 0.261212 2 692
Pr>F
Freq_card_repos 0.244741
Taille_cm 0.032351
Score_bien_etre 0.770194
No variable can enter
Out[55]:
STEPDISC()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
| method | 'forward' | |
| alpha | 0.01 | |
| lambda_init | None | |
| verbose | True |
In [56]:
# résumé du processus
step.summary_.summary
Out[56]:
| Wilks' Lambda | Partial R-Square | F Value | Num DF | Den DF | Pr>F | |
|---|---|---|---|---|---|---|
| Cholesterol | 0.271494 | 0.728506 | 935.139632 | 2.0 | 697.0 | 4.626402e-198 |
| IMC | 0.163714 | 0.396989 | 229.103636 | 2.0 | 696.0 | 3.574793e-77 |
| PA_systolique | 0.116789 | 0.286624 | 139.620616 | 2.0 | 695.0 | 1.066864e-51 |
| Glycemie_jeun | 0.089727 | 0.231722 | 104.659268 | 2.0 | 694.0 | 1.883373e-40 |
| Age | 0.075227 | 0.161596 | 66.785454 | 2.0 | 693.0 | 2.995750e-27 |
In [57]:
# liste initialie des variables
print("AVANT : ", np.sort(var_quanti))
# vs. liste des variables sélectionnées
print("APRES : ", np.sort(step.summary_.selected))
AVANT : ['Age' 'Cholesterol' 'Freq_card_repos' 'Glycemie_jeun' 'IMC' 'PA_systolique' 'Score_bien_etre' 'Taille_cm'] APRES : ['Age' 'Cholesterol' 'Glycemie_jeun' 'IMC' 'PA_systolique']
In [58]:
# sous forme de résumé
from discrimintools import summarySTEPDISC
summarySTEPDISC(step)
Stepwise Discriminant Analysis - Results
====================== Before forward selection =======================
Discriminant Analysis - Results
Summary Information:
Infos Value DF DF value
0 Total Sample Size 700 DF Total 699
1 Variables 8 DF Within Classes 697
2 Classes 3 DF Between Classes 2
Class Level Information:
Frequency Proportion Prior Probability
1Faible 295 0.4214 0.4214
2Modere 235 0.3357 0.3357
3Eleve 170 0.2429 0.2429
Linear Discriminant Function for Risque:
1Faible 2Modere 3Eleve
Constant -587.7005 -697.4996 -795.8079
Age 0.8575 1.1320 1.3574
IMC 3.4751 4.3236 4.8782
PA_systolique 1.2590 1.5127 1.7078
Cholesterol 0.6333 0.7599 0.8645
Glycemie_jeun 0.6123 0.7550 0.9370
Freq_card_repos 1.3983 1.4364 1.4723
Taille_cm 3.7012 3.7216 3.6786
Score_bien_etre 0.3197 0.3282 0.3361
====================== After forward selection =======================
Discriminant Analysis - Results
Summary Information:
Infos Value DF DF value
0 Total Sample Size 700 DF Total 699
1 Variables 5 DF Within Classes 697
2 Classes 3 DF Between Classes 2
Class Level Information:
Frequency Proportion Prior Probability
1Faible 295 0.4214 0.4214
2Modere 235 0.3357 0.3357
3Eleve 170 0.2429 0.2429
Linear Discriminant Function for Risque:
1Faible 2Modere 3Eleve
Constant -205.7444 -308.2085 -409.9892
Cholesterol 0.5199 0.6448 0.7490
IMC 3.4180 4.2587 4.8011
PA_systolique 1.4120 1.6667 1.8602
Glycemie_jeun 0.6322 0.7758 0.9587
Age 0.7919 1.0639 1.2864
In [59]:
# prédiction avec le modèle réduit
predSel = step.predict(dfTest[var_quanti])
np.unique(predSel,return_counts=True)
Out[59]:
(array(['1Faible', '2Modere', '3Eleve'], dtype=object), array([126, 103, 71]))
In [60]:
# matrice de confusion
print("Matrice de confusion : \n",metrics.confusion_matrix(dfTest.Risque,predSel))
# accuracy
print("\nAccuracy : ", metrics.accuracy_score(dfTest.Risque,predSel))
Matrice de confusion : [[126 0 0] [ 0 94 7] [ 0 9 64]] Accuracy : 0.9466666666666667
Analyse factorielle discriminante¶
In [61]:
# candisc
from discrimintools import CANDISC, summaryCANDISC
afd = CANDISC()
# entraînement -> 2 composantes puisque 3 classes
afd.fit(df[var_quanti],df.Risque)
# summary
summaryCANDISC(afd)
Canonical Discriminant Analysis - Results
Summary Information:
infos Value DF DF value
0 Total Sample Size 1000 DF Total 999
1 Variables 8 DF Within Classes 997
2 Classes 3 DF Between Classes 2
Class Level Information:
Frequency Proportion Prior Probability
1Faible 421 0.421 0.421
2Modere 336 0.336 0.336
3Eleve 243 0.243 0.243
Total-Sample Class Means:
1Faible 2Modere 3Eleve
Age 37.9050 51.9286 63.0905
IMC 21.9952 27.3452 30.8601
PA_systolique 115.0594 134.9970 149.5103
Cholesterol 169.4917 210.9256 244.3539
Glycemie_jeun 85.0214 105.6458 129.3128
Freq_card_repos 75.4608 74.5298 75.3374
Taille_cm 171.5938 172.7143 171.3786
Score_bien_etre 65.0071 64.9256 64.7490
Importance of components:
Eigenvalue Difference Proportion Cumulative
Can1 10.8597 10.8189 99.6259 99.6259
Can2 0.0408 NaN 0.3741 100.0000
Raw Canonical and Classification Functions Coefficients:
Can1 Can2 1Faible 2Modere 3Eleve
Constant -23.8137 10.0939 77.4005 -28.2988 -121.7512
Age 0.0554 -0.0058 -0.1948 0.0572 0.2584
IMC 0.1737 -0.1638 -0.6245 0.2194 0.7786
PA_systolique 0.0477 -0.0230 -0.1693 0.0542 0.2184
Cholesterol 0.0253 -0.0043 -0.0892 0.0266 0.1177
Glycemie_jeun 0.0386 0.0635 -0.1289 0.0211 0.1941
Freq_card_repos 0.0048 0.0274 -0.0139 -0.0028 0.0280
Taille_cm 0.0051 -0.0595 -0.0239 0.0216 0.0115
Score_bien_etre 0.0013 -0.0007 -0.0048 0.0015 0.0062
In [62]:
# distances de Mahalanobis entre les classes
afd.classes_.mahal
Out[62]:
| 1Faible | 2Modere | 3Eleve | |
|---|---|---|---|
| 1Faible | 0.000000 | 20.434427 | 66.984893 |
| 2Modere | 20.434427 | 0.000000 | 13.771719 |
| 3Eleve | 66.984893 | 13.771719 | 0.000000 |
In [63]:
# coordonnées des individus
# dans l'espace fatcoriel
afd.ind_.coord.head()
Out[63]:
| Can1 | Can2 | |
|---|---|---|
| 0 | -3.344178 | 0.574921 |
| 1 | 5.131832 | -0.180666 |
| 2 | 0.317565 | -0.477204 |
| 3 | 0.734462 | -0.505986 |
| 4 | -1.833913 | 1.014346 |
In [64]:
# projection dans le plan avec les classes d'appartenance
import seaborn as sns
sns.scatterplot(afd.ind_.coord,x="Can1",y="Can2",hue=df.Risque,s=15)
Out[64]:
<Axes: xlabel='Can1', ylabel='Can2'>
In [65]:
# outil de visualisation - biplot
from discrimintools import fviz_candisc
fviz_candisc(afd,element='biplot',geom_ind='point',point_args_ind={'size':0.8})
Out[65]:
Analyse discriminante linéaire - Variables mixtes¶
In [66]:
# liste des variables : quanti, quali
dfTrain.columns
Out[66]:
Index(['Age', 'IMC', 'PA_systolique', 'Cholesterol', 'Glycemie_jeun',
'Freq_card_repos', 'Taille_cm', 'Score_bien_etre', 'Sexe', 'Tabagisme',
'Act_physique', 'Antec_familial', 'Grp_sanguin', 'Region_resid',
'Risque'],
dtype='str')
In [67]:
dfTrain.select_dtypes(include=['str','object']).head()
Out[67]:
| Sexe | Tabagisme | Act_physique | Antec_familial | Grp_sanguin | Region_resid | Risque | |
|---|---|---|---|---|---|---|---|
| 778 | Femme | Actuel | Moderee | Oui | B | Sud | 2Modere |
| 956 | Femme | Ancien | Moderee | Oui | B | Sud | 2Modere |
| 711 | Homme | Ancien | Faible | Non | O | Est | 2Modere |
| 414 | Homme | Ancien | Faible | Oui | B | Est | 2Modere |
| 886 | Femme | Non | Faible | Oui | AB | Ouest | 2Modere |
In [68]:
# modalités des variables
_ = dfTrain. \
select_dtypes(include=['str','object']). \
apply(raw=True,func=lambda x: print(np.unique(x,return_counts=True)),axis=0)
(array(['Femme', 'Homme'], dtype=object), array([340, 360])) (array(['Actuel', 'Ancien', 'Non'], dtype=object), array([199, 178, 323])) (array(['Faible', 'Moderee', 'elevee'], dtype=object), array([222, 248, 230])) (array(['Non', 'Oui'], dtype=object), array([377, 323])) (array(['A', 'AB', 'B', 'O'], dtype=object), array([182, 163, 193, 162])) (array(['Est', 'Nord', 'Ouest', 'Sud'], dtype=object), array([158, 192, 147, 203])) (array(['1Faible', '2Modere', '3Eleve'], dtype=object), array([295, 235, 170]))
In [69]:
# modélisation avec l'ensemble des variables
adlMixed = DISCRIM()
adlMixed.fit(dfTrain.drop(columns=['Risque']),dfTrain.Risque)
# affichage des coefficients
adlMixed.coef_
Categorical features have been encoded into binary variables.
Out[69]:
| 1Faible | 2Modere | 3Eleve | |
|---|---|---|---|
| Constant | -613.857872 | -725.317494 | -828.076072 |
| Age | 0.905250 | 1.183821 | 1.414225 |
| IMC | 3.557204 | 4.409117 | 4.978252 |
| PA_systolique | 1.270664 | 1.522208 | 1.719463 |
| Cholesterol | 0.629839 | 0.758895 | 0.866641 |
| Glycemie_jeun | 0.674278 | 0.814519 | 0.997168 |
| Freq_card_repos | 1.438387 | 1.484948 | 1.528955 |
| Taille_cm | 3.786889 | 3.816004 | 3.789481 |
| Score_bien_etre | 0.307521 | 0.310589 | 0.319372 |
| SexeHomme | 1.404210 | 1.660624 | 1.600866 |
| TabagismeAncien | 2.743002 | 2.106216 | -2.874828 |
| TabagismeNon | 3.087176 | 0.816402 | -4.009861 |
| Act_physiqueModeree | 6.354146 | 6.741738 | 5.403188 |
| Act_physiqueelevee | 10.354135 | 8.937648 | 7.832321 |
| Antec_familialOui | 3.915938 | 5.826581 | 8.544247 |
| Grp_sanguinAB | 9.728814 | 10.335045 | 11.190349 |
| Grp_sanguinB | 12.647899 | 13.056246 | 13.569205 |
| Grp_sanguinO | 12.193647 | 12.352963 | 12.650449 |
| Region_residNord | 5.953027 | 6.313861 | 6.292362 |
| Region_residOuest | 6.499395 | 7.722298 | 8.317303 |
| Region_residSud | 8.104428 | 8.417343 | 8.427264 |
In [70]:
# évaluation en test
predMixed = adlMixed.predict(dfTest.drop(columns=['Risque']))
np.unique(predMixed,return_counts=True)
Out[70]:
(array(['1Faible', '2Modere', '3Eleve'], dtype=object), array([127, 99, 74]))
In [71]:
# matrice de confusion
print("Matrice de confusion : \n",metrics.confusion_matrix(dfTest.Risque,predMixed))
# accuracy
print("\nAccuracy : ", metrics.accuracy_score(dfTest.Risque,predMixed))
Matrice de confusion : [[126 0 0] [ 1 95 5] [ 0 4 69]] Accuracy : 0.9666666666666667