# change the working directory
# NOTE(review): hard-coded absolute Windows path; the relative file name
# passed to read_excel below is resolved against it
import os
os.chdir("C:/Users/ricco/Desktop/demo")
# load the data from the Excel workbook
import pandas
df = pandas.read_excel("mushroom_disqual.xls")
# summary of columns, non-null counts and dtypes
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8124 entries, 0 to 8123 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 classe 8124 non-null object 1 capshape 8124 non-null object 2 capcolor 8124 non-null object 3 gillsize 8124 non-null object 4 gillcolor 8124 non-null object 5 stalksurfaceabove 8124 non-null object 6 sporeprintcolor 8124 non-null object 7 SAMPLE_STATUS 8124 non-null object dtypes: object(8) memory usage: 507.9+ KB
# number of rows per SAMPLE_STATUS value (train / test)
df.SAMPLE_STATUS.value_counts()
SAMPLE_STATUS train 4124 test 4000 Name: count, dtype: int64
# train/test partition driven by the SAMPLE_STATUS column;
# the last column of df is left out of both subsets
dfTrain = df.loc[df.SAMPLE_STATUS == "train", df.columns[:-1]]
dfTest = df.loc[df.SAMPLE_STATUS == "test", df.columns[:-1]]
# dimensions of each subset, one per line
print(dfTrain.shape, dfTest.shape, sep="\n")
(4124, 7) (4000, 7)
ACM sur les descripteurs.
# instantiate and run the MCA (multiple correspondence analysis)
from scientisttools.decomposition import MCA
# active variables: every column of the training frame except the first one
activeCols = dfTrain.columns[1:]
my_mca = MCA(
    n_components=None,
    row_labels=dfTrain.index.values,
    var_labels=activeCols.values,
    mod_labels=None,
    matrix_type='completed',
    benzecri=True,
    greenacre=True,
    row_sup_labels=None,
    quali_sup_labels=None,
    quanti_sup_labels=None,
    graph=False,
)
# fit on the active variables only
my_mca.fit(dfTrain[activeCols])
# eigenvalues table
from scientisttools.extractfactor import get_eig
print(get_eig(my_mca))
eigenvalue difference proportion cumulative Dim.1 0.540128 0.124468 8.758835 8.758835 Dim.2 0.415660 0.102818 6.740432 15.499267 Dim.3 0.312842 0.013153 5.073117 20.572385 Dim.4 0.299689 0.063226 4.859826 25.432210 Dim.5 0.236463 0.012930 3.834541 29.266751 Dim.6 0.223533 0.006356 3.624864 32.891615 Dim.7 0.217178 0.017737 3.521797 36.413413 Dim.8 0.199441 0.009523 3.234171 39.647584 Dim.9 0.189918 0.004474 3.079746 42.727329 Dim.10 0.185443 0.005023 3.007191 45.734520 Dim.11 0.180421 0.000790 2.925745 48.660265 Dim.12 0.179631 0.005727 2.912933 51.573197 Dim.13 0.173903 0.002198 2.820056 54.393253 Dim.14 0.171705 0.001141 2.784411 57.177665 Dim.15 0.170565 0.003183 2.765914 59.943578 Dim.16 0.167381 0.001120 2.714292 62.657870 Dim.17 0.166262 0.002904 2.696133 65.354003 Dim.18 0.163358 0.000757 2.649047 68.003049 Dim.19 0.162601 0.006495 2.636774 70.639823 Dim.20 0.156106 0.001283 2.531447 73.171271 Dim.21 0.154823 0.003272 2.510646 75.681917 Dim.22 0.151551 0.005462 2.457589 78.139507 Dim.23 0.146089 0.005772 2.369018 80.508525 Dim.24 0.140318 0.004332 2.275423 82.783947 Dim.25 0.135986 0.006114 2.205180 84.989127 Dim.26 0.129872 0.004469 2.106027 87.095154 Dim.27 0.125402 0.011935 2.033551 89.128704 Dim.28 0.113467 0.001877 1.840011 90.968715 Dim.29 0.111590 0.008509 1.809570 92.778285 Dim.30 0.103081 0.021239 1.671592 94.449876 Dim.31 0.081842 0.018393 1.327173 95.777049 Dim.32 0.063449 0.005116 1.028905 96.805954 Dim.33 0.058333 0.005597 0.945938 97.751892 Dim.34 0.052736 0.016989 0.855173 98.607065 Dim.35 0.035747 0.000927 0.579675 99.186740 Dim.36 0.034820 0.019489 0.564649 99.751388 Dim.37 0.015331 0.015331 0.248612 100.000000
Corrections de Benzécri et de Greenacre : "scientisttools" se démarque des autres librairies ici, y compris par rapport aux librairies qui font référence sous R.
# applying the Benzécri and Greenacre corrections:
# print the corrected table stored on the fitted MCA
print(my_mca.greenacre_correction_)
eigenvalue proportion cumulative Dim.1 2.008418e-01 45.341261 45.341261 Dim.2 8.927664e-02 20.154742 65.496003 Dim.3 3.076891e-02 6.946267 72.442270 Dim.4 2.548081e-02 5.752447 78.194717 Dim.5 7.015070e-03 1.583695 79.778412 Dim.6 4.656692e-03 1.051276 80.829688 Dim.7 3.673936e-03 0.829413 81.659102 Dim.8 1.546742e-03 0.349186 82.008288 Dim.9 7.784756e-04 0.175746 82.184034 Dim.10 5.076969e-04 0.114616 82.298649 Dim.11 2.724181e-04 0.061500 82.360149 Dim.12 2.420205e-04 0.054638 82.414787 Dim.13 7.541481e-05 0.017025 82.431812 Dim.14 3.655917e-05 0.008253 82.440066 Dim.15 2.188007e-05 0.004940 82.445005 Dim.16 7.354486e-07 0.000166 82.445171
# plot - cumulative share of variance restored
# as a function of the number of factors,
# after the Benzécri and Greenacre corrections
import seaborn as sns
import numpy
cumulVar = my_mca.greenacre_correction_['cumulative']
# the factor ranks 1..n are derived from the table length rather than a
# hard-coded upper bound, so x and y always have matching lengths
sns.lineplot(x=numpy.arange(1, len(cumulVar) + 1), y=cumulVar, marker='.')
<Axes: ylabel='cumulative'>
# coordinates of the training individuals in the factorial space
# (the first column of dfTrain is excluded from the projection)
coordTrain = my_mca.transform(dfTrain.iloc[:,1:])
coordTrain.shape
(4124, 37)
# scatter plot of the individuals projected on the first factorial plane
# (first two columns of the training coordinates)
import matplotlib.pyplot as plt
plt.scatter(
    x=coordTrain[:, 0],
    y=coordTrain[:, 1],
    s=15,
    c='grey',
    marker='.',
)
plt.title("Plan factoriel - ACM")
plt.xlabel('Fact 1')
plt.ylabel('Fact 2')
plt.show()
# reminder - class membership counts in the training set
dfTrain['classe'].value_counts()
classe edible 2160 poisonous 1964 Name: count, dtype: int64
# same plot, points coloured by class membership
import numpy
# 'orangered' for poisonous rows, 'violet' for every other row
pointColors = numpy.where(dfTrain.classe == 'poisonous', 'orangered', 'violet')
plt.scatter(coordTrain[:, 0], coordTrain[:, 1], marker='.', s=15, c=pointColors)
plt.title("Plan factoriel ACM + Classes d'appartenance")
plt.xlabel('Fact 1')
plt.ylabel('Fact 2')
plt.show()
# new data frame holding the first two factors plus the class column
# (the class is aligned on the training index)
dfFactTrain = pandas.DataFrame(
    coordTrain[:, :2],
    columns=['F1', 'F2'],
    index=dfTrain.index,
).assign(classe=dfTrain.classe)
# info
dfFactTrain.info()
<class 'pandas.core.frame.DataFrame'> Index: 4124 entries, 0 to 4123 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 F1 4124 non-null float64 1 F2 4124 non-null float64 2 classe 4124 non-null object dtypes: float64(2), object(1) memory usage: 128.9+ KB
# linear discriminant analysis on the two factors
from scientisttools.discriminant_analysis import LDA
lda = LDA(
    features_labels=['F1', 'F2'],
    target=['classe'],
    row_labels=dfFactTrain.index,
)
# fit on the factor/class data frame
lda.fit(dfFactTrain)
LDA(features_labels=['F1', 'F2'], row_labels=Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ... 4114, 4115, 4116, 4117, 4118, 4119, 4120, 4121, 4122, 4123], dtype='int64', length=4124), target=['classe'])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LDA(features_labels=['F1', 'F2'], row_labels=Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ... 4114, 4115, 4116, 4117, 4118, 4119, 4120, 4121, 4122, 4123], dtype='int64', length=4124), target=['classe'])
# coefficients of the discriminant functions
lda.coef_
edible | poisonous | |
---|---|---|
F1 | 1.206881 | -1.327324 |
F2 | -2.253131 | 2.477985 |
# intercepts of the discriminant functions
lda.intercept_
edible | poisonous | |
---|---|---|
Intercept | -1.262199 | -1.4863 |
# the implicit equation of the boundary follows from the difference
# between the 'poisonous' and 'edible' discriminant functions
coefGap = lda.coef_['poisonous'] - lda.coef_['edible']
interceptGap = lda.intercept_['poisonous'] - lda.intercept_['edible']
c1 = coefGap['F1']
c2 = coefGap['F2']
c0 = interceptGap['Intercept']
# that is
print(f"{c1} x F1 + {c2} x F2 + {c0} = 0")
-2.5342047717164795 x F1 + 4.731115765604964 x F2 + -0.22410079340786226 = 0
# explicit form F2 = a * F1 + b, obtained by solving the implicit
# equation for F2 (slope a, intercept b)
a, b = -c1 / c2, -c0 / c2
# that is
print(f"F2 = {a} x F1 + {b}")
F2 = 0.5356463247295815 x F1 + 0.04736742969535151
# boundary - a separating straight line here since we work in the plane;
# it is drawn on top of the class-coloured scatter plot
plt.scatter(
    coordTrain[:, 0],
    coordTrain[:, 1],
    marker='.',
    s=15,
    c=numpy.where(dfTrain.classe == 'poisonous', 'orangered', 'violet'),
)
# the line is drawn between F1 = -1.5 and F1 = 1.0
plt.plot([-1.5, 1.0], [a * x + b for x in (-1.5, 1.0)])
plt.title("Plan factoriel ACM + Classes d'appartenance")
plt.xlabel('Fact 1')
plt.ylabel('Fact 2')
plt.show()
Projection des individus de l'échantillon test dans l'espace factoriel.
# project the test sample with the fitted MCA
# (first column of dfTest excluded), then keep the first two factors
coordTest = my_mca.transform(dfTest.iloc[:, 1:])
dfFactTest = pandas.DataFrame(
    coordTest[:, :2],
    index=dfTest.index,
    columns=['F1', 'F2'],
)
# display the first rows
dfFactTest.head()
F1 | F2 | |
---|---|---|
4124 | 0.634231 | -0.122952 |
4125 | -1.206404 | 0.487559 |
4126 | 0.264645 | -0.707573 |
4127 | -1.631717 | 0.547349 |
4128 | 0.079083 | -0.715910 |
Prédiction de l'analyse discriminante à partir de l'espace factoriel.
# prediction from the factorial coordinates of the test sample
predTest = lda.predict(dfFactTest)
# frequencies of the predicted classes
predTest.predict.value_counts()
predict edible 2239 poisonous 1761 Name: count, dtype: int64
# evaluation - confusion matrix (observed test classes vs. predictions)
from sklearn import metrics
metrics.ConfusionMatrixDisplay.from_predictions(dfTest.classe,predTest.predict)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1ce5da450c0>
# summary - classification report on the test sample
print(metrics.classification_report(dfTest.classe,predTest.predict))
precision recall f1-score support edible 0.85 0.93 0.89 2048 poisonous 0.92 0.83 0.87 1952 accuracy 0.88 4000 macro avg 0.88 0.88 0.88 4000 weighted avg 0.88 0.88 0.88 4000
# DISQUAL method - instantiation with 2 components;
# the features are all training columns except the first one
from scientisttools.discriminant_analysis import DISQUAL
disqual = DISQUAL(
    n_components=2,
    target=['classe'],
    features_labels=list(dfTrain.columns[1:]),
    row_labels=dfTrain.index,
)
# fit on the full training frame
disqual.fit(dfTrain)
DISQUAL(features_labels=['capshape', 'capcolor', 'gillsize', 'gillcolor', 'stalksurfaceabove', 'sporeprintcolor'], n_components=2, row_labels=Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ... 4114, 4115, 4116, 4117, 4118, 4119, 4120, 4121, 4122, 4123], dtype='int64', length=4124), target=['classe'])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DISQUAL(features_labels=['capshape', 'capcolor', 'gillsize', 'gillcolor', 'stalksurfaceabove', 'sporeprintcolor'], n_components=2, row_labels=Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ... 4114, 4115, 4116, 4117, 4118, 4119, 4120, 4121, 4122, 4123], dtype='int64', length=4124), target=['classe'])
# coefficients of the fitted DISQUAL model
disqual.coef_
edible | poisonous | |
---|---|---|
capshape_b | 0.640284 | -0.704182 |
capshape_c | 0.042620 | -0.046873 |
capshape_f | -0.010982 | 0.012078 |
capshape_k | -0.586159 | 0.644655 |
capshape_s | 0.624820 | -0.687175 |
capshape_x | 0.061858 | -0.068031 |
capcolor_b | 0.056840 | -0.062512 |
capcolor_c | -0.196299 | 0.215889 |
capcolor_e | -0.271719 | 0.298836 |
capcolor_g | -0.041820 | 0.045993 |
capcolor_n | 0.147669 | -0.162405 |
capcolor_p | 0.456480 | -0.502035 |
capcolor_r | -0.273811 | 0.301136 |
capcolor_u | -0.458444 | 0.504195 |
capcolor_w | 0.538635 | -0.592388 |
capcolor_y | -0.408486 | 0.449251 |
gillsize_b | 0.161651 | -0.177783 |
gillsize_n | -0.363269 | 0.399522 |
gillcolor_b | -0.674406 | 0.741710 |
gillcolor_e | 0.023658 | -0.026018 |
gillcolor_g | -0.431000 | 0.474013 |
gillcolor_h | -0.581091 | 0.639082 |
gillcolor_k | 0.741417 | -0.815408 |
gillcolor_n | 0.673835 | -0.741081 |
gillcolor_o | 1.063433 | -1.169560 |
gillcolor_p | 0.018935 | -0.020825 |
gillcolor_r | 1.322048 | -1.453984 |
gillcolor_u | 0.584738 | -0.643093 |
gillcolor_w | 0.315198 | -0.346654 |
gillcolor_y | 0.597601 | -0.657239 |
stalksurfaceabove_f | 0.378028 | -0.415754 |
stalksurfaceabove_k | -0.808545 | 0.889235 |
stalksurfaceabove_s | 0.328371 | -0.361142 |
stalksurfaceabove_y | -0.044378 | 0.048807 |
sporeprintcolor_b | 1.005598 | -1.105953 |
sporeprintcolor_h | -0.686066 | 0.754533 |
sporeprintcolor_k | 0.541894 | -0.595973 |
sporeprintcolor_n | 0.540265 | -0.594181 |
sporeprintcolor_o | 0.947164 | -1.041688 |
sporeprintcolor_r | 0.937589 | -1.031157 |
sporeprintcolor_u | 0.243253 | -0.267529 |
sporeprintcolor_w | -0.498439 | 0.548181 |
sporeprintcolor_y | 0.952978 | -1.048082 |
# intercepts of the fitted DISQUAL model
disqual.intercept_
edible | poisonous | |
---|---|---|
Intercept | -1.262199 | -1.4863 |
# prediction on the test sample (first column of dfTest excluded)
predTestDisqual = disqual.predict(dfTest.iloc[:,1:])
# counts of the predicted classes
predTestDisqual.predict.value_counts()
predict edible 2239 poisonous 1761 Name: count, dtype: int64
# evaluation - classification report on the test sample
print(metrics.classification_report(dfTest.classe,predTestDisqual.predict))
precision recall f1-score support edible 0.85 0.93 0.89 2048 poisonous 0.92 0.83 0.87 1952 accuracy 0.88 4000 macro avg 0.88 0.88 0.88 4000 weighted avg 0.88 0.88 0.88 4000
# split the training set into learning / validation subsets
# (70% learning, stratified on the class, fixed seed)
from sklearn.model_selection import train_test_split
dfLearning, dfValidation = train_test_split(
    dfTrain,
    train_size=0.7,
    random_state=0,
    stratify=dfTrain.classe,
)
# dimensions of each subset, one per line
print(dfLearning.shape, dfValidation.shape, sep="\n")
(2886, 7) (1238, 7)
# maximum number of axes to try
max_components = 15
# candidate numbers of components: 2 .. max_components inclusive
candidateK = range(2, max_components + 1)
# resubstitution accuracy (measured on the learning set itself)
acc_resub = numpy.zeros(len(candidateK))
# accuracy on the validation set
acc_valid = numpy.zeros(len(candidateK))
# feature names do not change across iterations, so compute them once
learningFeatures = list(dfLearning.columns[1:])
# iterate over the candidate numbers of components
for pos, k in enumerate(candidateK):
    # DISQUAL with k components, fitted on the learning set
    dq = DISQUAL(
        n_components=k,
        target=['classe'],
        features_labels=learningFeatures,
        row_labels=dfLearning.index,
    )
    dq.fit(dfLearning)
    # resubstitution accuracy
    acc_resub[pos] = metrics.accuracy_score(
        dfLearning.classe, dq.predict(dfLearning.iloc[:, 1:]).predict
    )
    # accuracy on the validation set
    acc_valid[pos] = metrics.accuracy_score(
        dfValidation.classe, dq.predict(dfValidation.iloc[:, 1:]).predict
    )
# results
print(acc_resub)
print(acc_valid)
[0.87422037 0.87318087 0.87318087 0.87525988 0.87872488 0.88877339 0.89050589 0.89258489 0.89223839 0.89327789 0.8953569 0.89431739 0.8970894 0.89258489] [0.8723748 0.86833603 0.86752827 0.86268174 0.86752827 0.88449111 0.88610662 0.88933764 0.88610662 0.88691438 0.88772213 0.88772213 0.88852989 0.88691438]
# plot - the number of components acts as the regularization parameter
import matplotlib.pyplot as plt
# shared x axis: 2 .. max_components
nbComponents = numpy.arange(2, max_components + 1, 1)
for accValues, curveName in ((acc_resub, 'Resub.'), (acc_valid, 'Valid. set')):
    plt.plot(nbComponents, accValues, marker='.', label=curveName)
plt.xlabel("# components")
plt.ylabel('Accuracy')
plt.legend()
plt.show()
# training with 9 components,
# this time on the whole training set
dqOpt = DISQUAL(
    n_components=9,
    target=['classe'],
    features_labels=list(dfTrain.columns[1:]),
    row_labels=dfTrain.index,
)
# fit
dqOpt.fit(dfTrain)
DISQUAL(features_labels=['capshape', 'capcolor', 'gillsize', 'gillcolor', 'stalksurfaceabove', 'sporeprintcolor'], n_components=9, row_labels=Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ... 4114, 4115, 4116, 4117, 4118, 4119, 4120, 4121, 4122, 4123], dtype='int64', length=4124), target=['classe'])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DISQUAL(features_labels=['capshape', 'capcolor', 'gillsize', 'gillcolor', 'stalksurfaceabove', 'sporeprintcolor'], n_components=9, row_labels=Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ... 4114, 4115, 4116, 4117, 4118, 4119, 4120, 4121, 4122, 4123], dtype='int64', length=4124), target=['classe'])
# evaluation on the test sample - classification report
# (predictions computed with the first column of dfTest excluded)
print(metrics.classification_report(dfTest.classe,dqOpt.predict(dfTest.iloc[:,1:]).predict))
precision recall f1-score support edible 0.86 0.95 0.90 2048 poisonous 0.94 0.84 0.89 1952 accuracy 0.90 4000 macro avg 0.90 0.89 0.89 4000 weighted avg 0.90 0.90 0.89 4000