Importation et inspection des données¶

In [ ]:
# imports needed to locate and load the data
import os
from pathlib import Path

import pandas

# Data folder: configurable through the HEART_DATA_DIR environment variable so
# the notebook can run on another machine; defaults to the author's original folder.
DATA_DIR = Path(os.environ.get("HEART_DATA_DIR", "C:/Users/ricco/Desktop/demo"))

# Load the workbook through an explicit path instead of os.chdir(), so the
# process-wide working directory is no longer changed as a side effect.
df = pandas.read_excel(DATA_DIR / "heart_weka_only_male.xls")

# structure of the data frame: column names, non-null counts and dtypes
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   disease          209 non-null    object
 1   age              209 non-null    int64 
 2   chest_pain       209 non-null    object
 3   restbpress       209 non-null    int64 
 4   blood_sugar      209 non-null    object
 5   restecg          209 non-null    object
 6   max_hrate        209 non-null    int64 
 7   exercice_angina  209 non-null    object
dtypes: int64(3), object(5)
memory usage: 13.2+ KB
In [ ]:
# frequency table of every categorical (object-typed) column, one after the other
for col in df.select_dtypes(include='object').columns:
    print(df[col].value_counts(), '\n')
disease
negative    117
positive     92
Name: count, dtype: int64 

chest_pain
asympt         102
atyp_angina     65
non_anginal     36
typ_angina       6
Name: count, dtype: int64 

blood_sugar
f    193
t     16
Name: count, dtype: int64 

restecg
normal                   174
st_t_wave_abnormality     30
left_vent_hyper            5
Name: count, dtype: int64 

exercice_angina
no     137
yes     72
Name: count, dtype: int64 

La méthode DISMIX¶

Instanciation, entraînement, classement en resubstitution¶

In [ ]:
# DISMIX estimator class
from scientisttools.discriminant_analysis import DISMIX

# Settings: 5 components, `disease` as the class to predict, and the predictors
# split into quantitative and qualitative columns.
dismix_params = dict(
    n_components=5,
    target=["disease"],
    quanti_features_labels=["age", "restbpress", "max_hrate"],
    quali_features_labels=["chest_pain", "blood_sugar", "restecg", "exercice_angina"],
    row_labels=df.index,
    priors=None,
)
mixdisc = DISMIX(**dismix_params)

# Fit on the whole data frame; the value returned by fit() is the cell's displayed output.
mixdisc.fit(df)
Out[ ]:
DISMIX(n_components=5,
       quali_features_labels=['chest_pain', 'blood_sugar', 'restecg',
                              'exercice_angina'],
       quanti_features_labels=['age', 'restbpress', 'max_hrate'],
       row_labels=RangeIndex(start=0, stop=209, step=1), target=['disease'])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DISMIX(n_components=5,
       quali_features_labels=['chest_pain', 'blood_sugar', 'restecg',
                              'exercice_angina'],
       quanti_features_labels=['age', 'restbpress', 'max_hrate'],
       row_labels=RangeIndex(start=0, stop=209, step=1), target=['disease'])
In [ ]:
# immediate results - coefficients: one column per class of `disease`,
# one row per quantitative variable and per level of each qualitative variable
mixdisc.coef_
Out[ ]:
negative positive
age -0.048053 0.061111
restbpress 0.039598 -0.050358
max_hrate 0.103109 -0.131127
chest_pain_asympt -0.085376 0.108577
chest_pain_atyp_angina 0.074180 -0.094337
chest_pain_non_anginal 0.058424 -0.074300
chest_pain_typ_angina -0.035248 0.044826
blood_sugar_f 0.007867 -0.010005
blood_sugar_t -0.027323 0.034748
restecg_left_vent_hyper 0.032314 -0.041095
restecg_normal -0.016439 0.020907
restecg_st_t_wave_abnormality 0.026399 -0.033573
exercice_angina_no 0.054955 -0.069889
exercice_angina_yes -0.075806 0.096405
In [ ]:
# intercepts: one value per class of `disease`
mixdisc.intercept_
Out[ ]:
negative positive
Intercept -0.877854 -1.302011
In [ ]:
from sklearn import metrics

# Confusion matrix in resubstitution: the model is scored on the very rows it
# was fitted on (all columns except the first one, which holds the target).
metrics.ConfusionMatrixDisplay.from_predictions(
    y_true=df['disease'],
    y_pred=mixdisc.predict(df.iloc[:, 1:]).predict,
)
Out[ ]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1a269f3b8e0>

Performances en validation croisée¶

In [ ]:
# cross-validation tool
from sklearn.model_selection import StratifiedKFold
# 5 stratified folds, shuffled with a fixed seed (random_state=1)
cv_splitter = StratifiedKFold(n_splits=5,shuffle=True,random_state=1)
# displays the number of folds the splitter will produce
cv_splitter.get_n_splits(df.iloc[:,1:],df.disease)
Out[ ]:
5
In [ ]:
import numpy

# Result vectors are sized from the splitter itself rather than a hard-coded 5,
# so changing n_splits on cv_splitter cannot leave them too short or too long.
n_folds = cv_splitter.get_n_splits()
effectifs = numpy.zeros(n_folds)  # number of test rows in each fold
acc = numpy.zeros(n_folds)        # test accuracy of each fold

# cross-validated evaluation: refit on each training part, score on the held-out part
for i,(train_index,test_index) in enumerate(cv_splitter.split(df.iloc[:,1:],df.disease)):
    # training / test sub-frames (positional row indices, all columns kept)
    dfTrain = df.iloc[train_index,:]
    dfTest = df.iloc[test_index,:]
    # a fresh DISMIX is fitted on the training rows only
    dmx = DISMIX(n_components=5,
                  target=["disease"],
                  quanti_features_labels=["age","restbpress","max_hrate"],
                  quali_features_labels=["chest_pain","blood_sugar","restecg","exercice_angina"],
                  row_labels=dfTrain.index,
                  priors=None)
    dmx.fit(dfTrain)
    # fold size and accuracy; the first column (the target) is dropped before predicting
    effectifs[i] = dfTest.shape[0]
    acc[i] = metrics.accuracy_score(dfTest.disease,dmx.predict(dfTest.iloc[:,1:]).predict)

# display the per-fold test sizes and accuracies
print(effectifs)
print(acc)
[42. 42. 42. 42. 41.]
[0.73809524 0.88095238 0.73809524 0.85714286 0.65853659]
In [ ]:
# cross-validated accuracy: fold accuracies averaged with the fold sizes as weights
(effectifs * acc).sum() / effectifs.sum()
Out[ ]:
0.7751196172248804

Décryptage de la méthode DISMIX¶

Analyse factorielle des données mixtes¶

In [ ]:
# FAMD (factor analysis of mixed data) model behind DISMIX
mixdisc.famd_model_
Out[ ]:
FAMD(n_components=5,
     quali_labels=['chest_pain', 'blood_sugar', 'restecg', 'exercice_angina'],
     quanti_labels=['age', 'restbpress', 'max_hrate'],
     row_labels=RangeIndex(start=0, stop=209, step=1))
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
FAMD(n_components=5,
     quali_labels=['chest_pain', 'blood_sugar', 'restecg', 'exercice_angina'],
     quanti_labels=['age', 'restbpress', 'max_hrate'],
     row_labels=RangeIndex(start=0, stop=209, step=1))
In [ ]:
# variance captured by the components of the FAMD (eigenvalue plot)
from scientisttools.pyplot import plot_eigenvalues
plot_eigenvalues(mixdisc.famd_model_,choice="eigenvalue")
In [ ]:
# factor coordinates of the individuals, with the five components labelled Z1..Z5
coord = pandas.DataFrame(
    mixdisc.famd_model_.row_coord_,
    columns=[f"Z{k}" for k in range(1, 6)],
)
coord.head(12)
Out[ ]:
Z1 Z2 Z3 Z4 Z5
0 -1.069698 0.966425 0.071447 0.170056 1.060476
1 0.876019 0.898690 -0.368467 -0.498966 0.337081
2 -0.648692 -3.072876 0.430116 -0.108650 -0.572960
3 -0.854773 -1.004573 -0.671663 0.468576 -1.950425
4 0.291900 0.342568 -0.166640 -0.265184 -0.159744
5 0.237483 0.289115 -0.149369 -0.259315 -0.183182
6 2.245878 -5.020715 -1.533806 -0.186040 4.486253
7 2.314921 -0.384783 0.439483 0.991553 -0.881329
8 0.747094 0.101416 -0.333469 -0.512404 -0.212075
9 2.890754 0.204545 0.762626 1.649473 -0.808793
10 -0.629731 -0.992587 -0.062308 1.424108 -2.229672
11 0.290273 0.477398 -0.734068 -1.250931 0.254886
In [ ]:
import seaborn as sns

# attach the target so the scatter matrix can be coloured by class
coord['disease'] = df['disease']

# pairwise scatter plots of the factor coordinates, coloured by disease status
sns.pairplot(coord, hue='disease')
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x1a27f61db40>

Analyse discriminante linéaire¶

In [ ]:
# linear discriminant analysis model; its features are the factors Z1..Z5
mixdisc.lda_model_
Out[ ]:
LDA(features_labels=Index(['Z1', 'Z2', 'Z3', 'Z4', 'Z5'], dtype='object'),
    row_labels=RangeIndex(start=0, stop=209, step=1), target=['disease'])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LDA(features_labels=Index(['Z1', 'Z2', 'Z3', 'Z4', 'Z5'], dtype='object'),
    row_labels=RangeIndex(start=0, stop=209, step=1), target=['disease'])
In [ ]:
# coefficients of the separating hyperplane, expressed on the factors Z1..Z5
# (one column per class of `disease`)
mixdisc.lda_coef_
Out[ ]:
negative positive
Z1 -0.561837 0.714510
Z2 -0.199247 0.253390
Z3 -0.019332 0.024585
Z4 0.467886 -0.595030
Z5 -0.143060 0.181935
In [ ]:
# intercept of the discriminant analysis (one value per class of `disease`);
# the displayed values match those of mixdisc.intercept_
mixdisc.lda_intercept_
Out[ ]:
negative positive
Intercept -0.877854 -1.302011