# Change the working directory to the folder that holds the demo files
import os
os.chdir("C:/Users/ricco/Desktop/demo")
# Import the data from the Excel workbook into a DataFrame
import pandas
df = pandas.read_excel("heart_weka_only_male.xls")
# Print the summary of the frame (columns, non-null counts, dtypes)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 209 entries, 0 to 208 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 disease 209 non-null object 1 age 209 non-null int64 2 chest_pain 209 non-null object 3 restbpress 209 non-null int64 4 blood_sugar 209 non-null object 5 restecg 209 non-null object 6 max_hrate 209 non-null int64 7 exercice_angina 209 non-null object dtypes: int64(3), object(5) memory usage: 13.2+ KB
# List the categories (with their counts) of every object-dtype variable
for v in df.select_dtypes(include='object').columns:
    # The extra '\n' leaves a blank line between two consecutive tables
    print(df[v].value_counts(),'\n')
disease
negative    117
positive     92
Name: count, dtype: int64

chest_pain
asympt         102
atyp_angina     65
non_anginal     36
typ_angina       6
Name: count, dtype: int64

blood_sugar
f    193
t     16
Name: count, dtype: int64

restecg
normal                   174
st_t_wave_abnormality     30
left_vent_hyper            5
Name: count, dtype: int64

exercice_angina
no     137
yes     72
Name: count, dtype: int64
# Import the estimator class
from scientisttools.discriminant_analysis import DISMIX

# Column roles, named once so the constructor call stays readable
quanti_cols = ["age", "restbpress", "max_hrate"]
quali_cols = ["chest_pain", "blood_sugar", "restecg", "exercice_angina"]

# Instantiation: "disease" is the target, the other columns are predictors
mixdisc = DISMIX(
    n_components=5,
    target=["disease"],
    quanti_features_labels=quanti_cols,
    quali_features_labels=quali_cols,
    row_labels=df.index,
    priors=None,
)

# Training - fit on the whole data frame (target column included)
mixdisc.fit(df)
DISMIX(n_components=5, quali_features_labels=['chest_pain', 'blood_sugar', 'restecg', 'exercice_angina'], quanti_features_labels=['age', 'restbpress', 'max_hrate'], row_labels=RangeIndex(start=0, stop=209, step=1), target=['disease'])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DISMIX(n_components=5, quali_features_labels=['chest_pain', 'blood_sugar', 'restecg', 'exercice_angina'], quanti_features_labels=['age', 'restbpress', 'max_hrate'], row_labels=RangeIndex(start=0, stop=209, step=1), target=['disease'])
# Immediate results - classification coefficients of the fitted model
# (the cell output lists one row per variable / category, one column per class)
mixdisc.coef_
| | negative | positive |
---|---|---|
age | -0.048053 | 0.061111 |
restbpress | 0.039598 | -0.050358 |
max_hrate | 0.103109 | -0.131127 |
chest_pain_asympt | -0.085376 | 0.108577 |
chest_pain_atyp_angina | 0.074180 | -0.094337 |
chest_pain_non_anginal | 0.058424 | -0.074300 |
chest_pain_typ_angina | -0.035248 | 0.044826 |
blood_sugar_f | 0.007867 | -0.010005 |
blood_sugar_t | -0.027323 | 0.034748 |
restecg_left_vent_hyper | 0.032314 | -0.041095 |
restecg_normal | -0.016439 | 0.020907 |
restecg_st_t_wave_abnormality | 0.026399 | -0.033573 |
exercice_angina_no | 0.054955 | -0.069889 |
exercice_angina_yes | -0.075806 | 0.096405 |
# Intercept of the classification function of each class
mixdisc.intercept_
| | negative | positive |
---|---|---|
Intercept | -0.877854 | -1.302011 |
# Confusion matrix in resubstitution (predictions on the data used for the fit)
from sklearn import metrics
# Predictors only: every column after the first one
resub_features = df.iloc[:, 1:]
# Predicted labels are read from the `predict` attribute of the returned object
resub_pred = mixdisc.predict(resub_features).predict
metrics.ConfusionMatrixDisplay.from_predictions(df.disease, resub_pred)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1a269f3b8e0>
# Tool for cross-validation: stratified 5-fold splitter, shuffled with a fixed seed
from sklearn.model_selection import StratifiedKFold
cv_splitter = StratifiedKFold(n_splits=5,shuffle=True,random_state=1)
# Number of folds the splitter produces (displayed as the cell output)
cv_splitter.get_n_splits(df.iloc[:,1:],df.disease)
5
# Result vectors: one slot per fold for the test-fold size and its accuracy.
# Their length is taken from the splitter so it always matches n_splits.
import numpy
n_folds = cv_splitter.get_n_splits()
effectifs = numpy.zeros(n_folds)
acc = numpy.zeros(n_folds)

# Cross-validated evaluation
for i, (train_index, test_index) in enumerate(cv_splitter.split(df.iloc[:, 1:], df.disease)):
    # Sub-data frames of the current fold
    dfTrain = df.iloc[train_index, :]
    dfTest = df.iloc[test_index, :]
    # Training: a fresh model fitted on the training fold only
    dmx = DISMIX(
        n_components=5,
        target=["disease"],
        quanti_features_labels=["age", "restbpress", "max_hrate"],
        quali_features_labels=["chest_pain", "blood_sugar", "restecg", "exercice_angina"],
        row_labels=dfTrain.index,
        priors=None,
    )
    dmx.fit(dfTrain)
    # Evaluation on the held-out fold (predictors are the columns after the first)
    effectifs[i] = dfTest.shape[0]
    acc[i] = metrics.accuracy_score(dfTest.disease, dmx.predict(dfTest.iloc[:, 1:]).predict)

# Display the fold sizes and the per-fold accuracies, once, after the loop
print(effectifs)
print(acc)
[42. 42. 42. 42. 41.]
[0.73809524 0.88095238 0.73809524 0.85714286 0.65853659]
# Hence the cross-validated accuracy: per-fold accuracies weighted by fold size
numpy.average(acc, weights=effectifs)
0.7751196172248804
# FAMD model behind the fitted DISMIX object
mixdisc.famd_model_
FAMD(n_components=5, quali_labels=['chest_pain', 'blood_sugar', 'restecg', 'exercice_angina'], quanti_labels=['age', 'restbpress', 'max_hrate'], row_labels=RangeIndex(start=0, stop=209, step=1))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
FAMD(n_components=5, quali_labels=['chest_pain', 'blood_sugar', 'restecg', 'exercice_angina'], quanti_labels=['age', 'restbpress', 'max_hrate'], row_labels=RangeIndex(start=0, stop=209, step=1))
# Explained variance: plot the eigenvalues of the underlying FAMD model
from scientisttools.pyplot import plot_eigenvalues
plot_eigenvalues(mixdisc.famd_model_,choice="eigenvalue")
# Factor coordinates of the individuals, with columns labelled Z1..Z5
axis_names = [f"Z{k}" for k in range(1, 6)]
coord = pandas.DataFrame(mixdisc.famd_model_.row_coord_, columns=axis_names)
# Show the first 12 individuals
coord.head(12)
| | Z1 | Z2 | Z3 | Z4 | Z5 |
---|---|---|---|---|---|
0 | -1.069698 | 0.966425 | 0.071447 | 0.170056 | 1.060476 |
1 | 0.876019 | 0.898690 | -0.368467 | -0.498966 | 0.337081 |
2 | -0.648692 | -3.072876 | 0.430116 | -0.108650 | -0.572960 |
3 | -0.854773 | -1.004573 | -0.671663 | 0.468576 | -1.950425 |
4 | 0.291900 | 0.342568 | -0.166640 | -0.265184 | -0.159744 |
5 | 0.237483 | 0.289115 | -0.149369 | -0.259315 | -0.183182 |
6 | 2.245878 | -5.020715 | -1.533806 | -0.186040 | 4.486253 |
7 | 2.314921 | -0.384783 | 0.439483 | 0.991553 | -0.881329 |
8 | 0.747094 | 0.101416 | -0.333469 | -0.512404 | -0.212075 |
9 | 2.890754 | 0.204545 | 0.762626 | 1.649473 | -0.808793 |
10 | -0.629731 | -0.992587 | -0.062308 | 1.424108 | -2.229672 |
11 | 0.290273 | 0.477398 | -0.734068 | -1.250931 | 0.254886 |
# Add the target to the coordinates data set (used as the hue of the plot below)
coord['disease'] = df.disease
# Pairplot of the factor coordinates, coloured by the target class
import seaborn as sns
sns.pairplot(data=coord,hue='disease')
<seaborn.axisgrid.PairGrid at 0x1a27f61db40>
# Discriminant analysis model stored on the fitted DISMIX object
mixdisc.lda_model_
LDA(features_labels=Index(['Z1', 'Z2', 'Z3', 'Z4', 'Z5'], dtype='object'), row_labels=RangeIndex(start=0, stop=209, step=1), target=['disease'])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LDA(features_labels=Index(['Z1', 'Z2', 'Z3', 'Z4', 'Z5'], dtype='object'), row_labels=RangeIndex(start=0, stop=209, step=1), target=['disease'])
# Coefficients of the separating hyperplane
# (the cell output lists one row per factor axis, one column per class)
mixdisc.lda_coef_
| | negative | positive |
---|---|---|
Z1 | -0.561837 | 0.714510 |
Z2 | -0.199247 | 0.253390 |
Z3 | -0.019332 | 0.024585 |
Z4 | 0.467886 | -0.595030 |
Z5 | -0.143060 | 0.181935 |
# Intercept of the discriminant analysis, one value per class
mixdisc.lda_intercept_
| | negative | positive |
---|---|---|
Intercept | -0.877854 | -1.302011 |