Importing and preparing the data

In [ ]:
#change the working directory
import os
os.chdir("C:/Users/ricco/Desktop/demo")

#load the dataset
import pandas
df = pandas.read_excel("mushroom_disqual.xls")
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   classe             8124 non-null   object
 1   capshape           8124 non-null   object
 2   capcolor           8124 non-null   object
 3   gillsize           8124 non-null   object
 4   gillcolor          8124 non-null   object
 5   stalksurfaceabove  8124 non-null   object
 6   sporeprintcolor    8124 non-null   object
 7   SAMPLE_STATUS      8124 non-null   object
dtypes: object(8)
memory usage: 507.9+ KB
In [ ]:
#train-test sample sizes
df.SAMPLE_STATUS.value_counts()
Out[ ]:
SAMPLE_STATUS
train    4124
test     4000
Name: count, dtype: int64
In [ ]:
#train-test split based on the SAMPLE_STATUS column
#then drop that last column
dfTrain = df.loc[df.SAMPLE_STATUS == "train"][df.columns[:-1]]
dfTest = df.loc[df.SAMPLE_STATUS == "test"][df.columns[:-1]]

#dimensions
print(dfTrain.shape)
print(dfTest.shape)
(4124, 7)
(4000, 7)

The DISQUAL method step by step

MCA - Multiple Correspondence Analysis

MCA on the descriptors, i.e. the six qualitative predictors (every column except the class). With J = 43 categories spread over Q = 6 variables, the MCA yields J - Q = 37 factors.

In [ ]:
#instantiate and set up the MCA
from scientisttools.decomposition import MCA
my_mca = MCA(n_components=None,
              row_labels=dfTrain.index.values,
              var_labels=dfTrain.columns[1:].values,
              mod_labels=None,
              matrix_type='completed',
              benzecri=True,
              greenacre=True,
              row_sup_labels=None,
              quali_sup_labels=None,
              quanti_sup_labels=None,
              graph=False)

#fit
my_mca.fit(dfTrain[dfTrain.columns[1:]])

#eigenvalues
from scientisttools.extractfactor import get_eig
print(get_eig(my_mca))
        eigenvalue  difference  proportion  cumulative
Dim.1     0.540128    0.124468    8.758835    8.758835
Dim.2     0.415660    0.102818    6.740432   15.499267
Dim.3     0.312842    0.013153    5.073117   20.572385
Dim.4     0.299689    0.063226    4.859826   25.432210
Dim.5     0.236463    0.012930    3.834541   29.266751
Dim.6     0.223533    0.006356    3.624864   32.891615
Dim.7     0.217178    0.017737    3.521797   36.413413
Dim.8     0.199441    0.009523    3.234171   39.647584
Dim.9     0.189918    0.004474    3.079746   42.727329
Dim.10    0.185443    0.005023    3.007191   45.734520
Dim.11    0.180421    0.000790    2.925745   48.660265
Dim.12    0.179631    0.005727    2.912933   51.573197
Dim.13    0.173903    0.002198    2.820056   54.393253
Dim.14    0.171705    0.001141    2.784411   57.177665
Dim.15    0.170565    0.003183    2.765914   59.943578
Dim.16    0.167381    0.001120    2.714292   62.657870
Dim.17    0.166262    0.002904    2.696133   65.354003
Dim.18    0.163358    0.000757    2.649047   68.003049
Dim.19    0.162601    0.006495    2.636774   70.639823
Dim.20    0.156106    0.001283    2.531447   73.171271
Dim.21    0.154823    0.003272    2.510646   75.681917
Dim.22    0.151551    0.005462    2.457589   78.139507
Dim.23    0.146089    0.005772    2.369018   80.508525
Dim.24    0.140318    0.004332    2.275423   82.783947
Dim.25    0.135986    0.006114    2.205180   84.989127
Dim.26    0.129872    0.004469    2.106027   87.095154
Dim.27    0.125402    0.011935    2.033551   89.128704
Dim.28    0.113467    0.001877    1.840011   90.968715
Dim.29    0.111590    0.008509    1.809570   92.778285
Dim.30    0.103081    0.021239    1.671592   94.449876
Dim.31    0.081842    0.018393    1.327173   95.777049
Dim.32    0.063449    0.005116    1.028905   96.805954
Dim.33    0.058333    0.005597    0.945938   97.751892
Dim.34    0.052736    0.016989    0.855173   98.607065
Dim.35    0.035747    0.000927    0.579675   99.186740
Dim.36    0.034820    0.019489    0.564649   99.751388
Dim.37    0.015331    0.015331    0.248612  100.000000

Benzécri and Greenacre corrections: this is where "scientisttools" stands out from other libraries, including the reference packages in R.
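As a reminder of what the correction does, here is a minimal sketch (an illustration, not the library's internal code), assuming Q = 6 active variables as in this dataset: only the eigenvalues larger than 1/Q are kept and rescaled; Greenacre's variant then divides the same corrected values by a different estimate of the total inertia instead of their own sum.

#Benzécri correction sketch (assumption: Q = 6 active variables, raw eigenvalues from get_eig)
eig = get_eig(my_mca)['eigenvalue'].values
Q = 6
kept = eig[eig > 1/Q]                      #only eigenvalues above 1/Q are corrected
benzecri = ((Q/(Q-1))*(kept - 1/Q))**2     #lambda* = [Q/(Q-1) * (lambda - 1/Q)]^2
print(benzecri/benzecri.sum()*100)         #Benzécri proportions (Greenacre uses another denominator)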

In [ ]:
#apply the Benzécri and Greenacre corrections
print(my_mca.greenacre_correction_)
          eigenvalue  proportion  cumulative
Dim.1   2.008418e-01   45.341261   45.341261
Dim.2   8.927664e-02   20.154742   65.496003
Dim.3   3.076891e-02    6.946267   72.442270
Dim.4   2.548081e-02    5.752447   78.194717
Dim.5   7.015070e-03    1.583695   79.778412
Dim.6   4.656692e-03    1.051276   80.829688
Dim.7   3.673936e-03    0.829413   81.659102
Dim.8   1.546742e-03    0.349186   82.008288
Dim.9   7.784756e-04    0.175746   82.184034
Dim.10  5.076969e-04    0.114616   82.298649
Dim.11  2.724181e-04    0.061500   82.360149
Dim.12  2.420205e-04    0.054638   82.414787
Dim.13  7.541481e-05    0.017025   82.431812
Dim.14  3.655917e-05    0.008253   82.440066
Dim.15  2.188007e-05    0.004940   82.445005
Dim.16  7.354486e-07    0.000166   82.445171
In [ ]:
#plot - cumulative proportion of explained variance
#as a function of the number of factors
#after the Benzécri and Greenacre corrections
import seaborn as sns
import numpy
sns.lineplot(x=numpy.arange(1,17,1),y=my_mca.greenacre_correction_['cumulative'],marker='.')
Out[ ]:
<Axes: ylabel='cumulative'>
In [ ]:
#coordinates of the individuals in the factor space
coordTrain = my_mca.transform(dfTrain.iloc[:,1:])
coordTrain.shape
Out[ ]:
(4124, 37)
In [ ]:
#plot - representation in the first factorial plane
#projection of the individuals in the factor space
import matplotlib.pyplot as plt
plt.scatter(coordTrain[:,0], coordTrain[:,1], marker='.',c='grey',s=15)
plt.title("Plan factoriel - ACM")
plt.xlabel('Fact 1')
plt.ylabel('Fact 2')
plt.show()

Linear discriminant analysis in the factor space

In [ ]:
#reminder - class membership
dfTrain.classe.value_counts()
Out[ ]:
classe
edible       2160
poisonous    1964
Name: count, dtype: int64
In [ ]:
#plot with the class membership
import numpy
plt.scatter(coordTrain[:,0], coordTrain[:,1], marker='.',s=15,
            c=numpy.array(['violet','orangered'])[1*(dfTrain.classe=='poisonous')])
plt.title("Plan factoriel ACM + Classes d'appartenance")
plt.xlabel('Fact 1')
plt.ylabel('Fact 2')
plt.show()
In [ ]:
#new data frame with the first 2 factors
dfFactTrain = pandas.DataFrame(coordTrain[:,:2],columns=['F1','F2'],index=dfTrain.index)

#append the class
dfFactTrain['classe'] = dfTrain.classe

#info
dfFactTrain.info()
<class 'pandas.core.frame.DataFrame'>
Index: 4124 entries, 0 to 4123
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   F1      4124 non-null   float64
 1   F2      4124 non-null   float64
 2   classe  4124 non-null   object 
dtypes: float64(2), object(1)
memory usage: 128.9+ KB
In [ ]:
#linear discriminant analysis
from scientisttools.discriminant_analysis import LDA
lda = LDA(features_labels=['F1','F2'],target=['classe'],row_labels=dfFactTrain.index)

#fit
lda.fit(dfFactTrain)
Out[ ]:
LDA(features_labels=['F1', 'F2'],
    row_labels=Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       4114, 4115, 4116, 4117, 4118, 4119, 4120, 4121, 4122, 4123],
      dtype='int64', length=4124),
    target=['classe'])
In [ ]:
#coefficients of the classification functions
lda.coef_
Out[ ]:
       edible  poisonous
F1   1.206881  -1.327324
F2  -2.253131   2.477985
In [ ]:
#intercept
lda.intercept_
Out[ ]:
              edible  poisonous
Intercept  -1.262199    -1.4863
In [ ]:
#from which we can derive the implicit equation of the boundary
c1 = lda.coef_['poisonous']['F1'] - lda.coef_['edible']['F1']
c2 = lda.coef_['poisonous']['F2'] - lda.coef_['edible']['F2']
c0 = lda.intercept_['poisonous']['Intercept'] - lda.intercept_['edible']['Intercept']

#that is
print(f"{c1} x F1 + {c2} x F2 + {c0} = 0")
-2.5342047717164795 x F1 + 4.731115765604964 x F2 + -0.22410079340786226 = 0
In [ ]:
#and hence its explicit equation
a = -c1/c2
b = -c0/c2

#that is
print(f"F2 = {a} x F1 + {b}")
F2 = 0.5356463247295815 x F1 + 0.04736742969535151
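To double-check the algebra, a point lying on this line should receive identical scores from the two classification functions (a small sketch using the same coefficient access as above):

#check: a point on the boundary gets the same edible/poisonous scores
f1 = 0.5
f2 = a*f1 + b
s_e = lda.intercept_['edible']['Intercept'] + lda.coef_['edible']['F1']*f1 + lda.coef_['edible']['F2']*f2
s_p = lda.intercept_['poisonous']['Intercept'] + lda.coef_['poisonous']['F1']*f1 + lda.coef_['poisonous']['F2']*f2
print(s_e, s_p)   #the two values should be (nearly) identical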
In [ ]:
#boundary - a straight separating line here since we are working in the plane
#which we can draw on the chart
plt.scatter(coordTrain[:,0], coordTrain[:,1], marker='.',s=15,
            c=numpy.array(['violet','orangered'])[1*(dfTrain.classe=='poisonous')])
plt.plot([-1.5,1.0],[a*(-1.5)+b,a*(1.0)+b])
plt.title("Plan factoriel ACM + Classes d'appartenance")
plt.xlabel('Fact 1')
plt.ylabel('Fact 2')
plt.show()

Evaluation on the test sample

Projection of the test-sample individuals into the factor space.

In [ ]:
#projection of the test sample
coordTest = my_mca.transform(dfTest[dfTest.columns[1:]])
dfFactTest = pandas.DataFrame(coordTest[:,:2],columns=['F1','F2'],index=dfTest.index)

#display
dfFactTest.head()
Out[ ]:
            F1        F2
4124  0.634231 -0.122952
4125 -1.206404  0.487559
4126  0.264645 -0.707573
4127 -1.631717  0.547349
4128  0.079083 -0.715910

Prediction of the discriminant analysis from the factor space.

In [ ]:
#prediction from the factor coordinates
predTest = lda.predict(dfFactTest)

#frequencies of the predicted classes
predTest.predict.value_counts()
Out[ ]:
predict
edible       2239
poisonous    1761
Name: count, dtype: int64
In [ ]:
#evaluation - confusion matrix
from sklearn import metrics
metrics.ConfusionMatrixDisplay.from_predictions(dfTest.classe,predTest.predict)
Out[ ]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1ce5da450c0>
In [ ]:
#summary report
print(metrics.classification_report(dfTest.classe,predTest.predict))
              precision    recall  f1-score   support

      edible       0.85      0.93      0.89      2048
   poisonous       0.92      0.83      0.87      1952

    accuracy                           0.88      4000
   macro avg       0.88      0.88      0.88      4000
weighted avg       0.88      0.88      0.88      4000
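For a single summary figure, the overall accuracy can also be computed directly from the same objects (a minimal sketch; the value should match the 0.88 reported above).

#overall accuracy on the test sample
print(metrics.accuracy_score(dfTest.classe, predTest.predict))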

The DISQUAL method implemented in "scientisttools"

Modeling and evaluation (with a fixed number of factors)

In [ ]:
#instantiate the DISQUAL method
from scientisttools.discriminant_analysis import DISQUAL
disqual = DISQUAL(n_components=2,target=['classe'],features_labels=list(dfTrain.columns[1:]),row_labels=dfTrain.index)

#fit
disqual.fit(dfTrain)
Out[ ]:
DISQUAL(features_labels=['capshape', 'capcolor', 'gillsize', 'gillcolor',
                         'stalksurfaceabove', 'sporeprintcolor'],
        n_components=2,
        row_labels=Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       4114, 4115, 4116, 4117, 4118, 4119, 4120, 4121, 4122, 4123],
      dtype='int64', length=4124),
        target=['classe'])
In [ ]:
#coefficients
disqual.coef_
Out[ ]:
edible poisonous
capshape_b 0.640284 -0.704182
capshape_c 0.042620 -0.046873
capshape_f -0.010982 0.012078
capshape_k -0.586159 0.644655
capshape_s 0.624820 -0.687175
capshape_x 0.061858 -0.068031
capcolor_b 0.056840 -0.062512
capcolor_c -0.196299 0.215889
capcolor_e -0.271719 0.298836
capcolor_g -0.041820 0.045993
capcolor_n 0.147669 -0.162405
capcolor_p 0.456480 -0.502035
capcolor_r -0.273811 0.301136
capcolor_u -0.458444 0.504195
capcolor_w 0.538635 -0.592388
capcolor_y -0.408486 0.449251
gillsize_b 0.161651 -0.177783
gillsize_n -0.363269 0.399522
gillcolor_b -0.674406 0.741710
gillcolor_e 0.023658 -0.026018
gillcolor_g -0.431000 0.474013
gillcolor_h -0.581091 0.639082
gillcolor_k 0.741417 -0.815408
gillcolor_n 0.673835 -0.741081
gillcolor_o 1.063433 -1.169560
gillcolor_p 0.018935 -0.020825
gillcolor_r 1.322048 -1.453984
gillcolor_u 0.584738 -0.643093
gillcolor_w 0.315198 -0.346654
gillcolor_y 0.597601 -0.657239
stalksurfaceabove_f 0.378028 -0.415754
stalksurfaceabove_k -0.808545 0.889235
stalksurfaceabove_s 0.328371 -0.361142
stalksurfaceabove_y -0.044378 0.048807
sporeprintcolor_b 1.005598 -1.105953
sporeprintcolor_h -0.686066 0.754533
sporeprintcolor_k 0.541894 -0.595973
sporeprintcolor_n 0.540265 -0.594181
sporeprintcolor_o 0.947164 -1.041688
sporeprintcolor_r 0.937589 -1.031157
sporeprintcolor_u 0.243253 -0.267529
sporeprintcolor_w -0.498439 0.548181
sporeprintcolor_y 0.952978 -1.048082
In [ ]:
#intercept
disqual.intercept_
Out[ ]:
              edible  poisonous
Intercept  -1.262199    -1.4863
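To see how these coefficients operate, here is a hedged sketch that scores the first test mushroom by hand, assuming the classification functions are simply the intercept plus the coefficients of the observed categories (rows labelled variable_category, as displayed above); the predicted class would be the column with the larger score.

#manual scoring of the first test individual (sketch, under the assumption above)
x = dfTest.iloc[0, 1:]                           #categories taken by the descriptors
score = disqual.intercept_.copy()                #start from the intercepts
for var, cat in x.items():
    score += disqual.coef_.loc[var + "_" + cat]  #add the coefficient of the observed category
print(score)                                     #predicted class = column with the larger value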
In [ ]:
#prediction on the test sample
predTestDisqual = disqual.predict(dfTest.iloc[:,1:])

#counts
predTestDisqual.predict.value_counts()
Out[ ]:
predict
edible       2239
poisonous    1761
Name: count, dtype: int64
In [ ]:
#evaluation
print(metrics.classification_report(dfTest.classe,predTestDisqual.predict))
              precision    recall  f1-score   support

      edible       0.85      0.93      0.89      2048
   poisonous       0.92      0.83      0.87      1952

    accuracy                           0.88      4000
   macro avg       0.88      0.88      0.88      4000
weighted avg       0.88      0.88      0.88      4000
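As a sanity check (a sketch reusing the objects created above), the predictions of the built-in DISQUAL and of the manual MCA + LDA pipeline should coincide here, since both rely on the same two factors.

#agreement rate between the two sets of predictions (expected: 1.0)
print((predTestDisqual.predict.values == predTest.predict.values).mean())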

Determining the "optimal" number of factors

In [ ]:
#split the training set into learning/validation subsets
from sklearn.model_selection import train_test_split
dfLearning, dfValidation = train_test_split(dfTrain,train_size=0.7,random_state=0,stratify=dfTrain.classe)

#dim.
print(dfLearning.shape)
print(dfValidation.shape)
(2886, 7)
(1238, 7)
In [ ]:
#maximum number of axes
max_components = 15

#resubstitution accuracy
acc_resub = numpy.zeros(max_components-1)

#accuracy on the validation set
acc_valid = numpy.zeros(max_components-1)

#iterate over the number of components
for k in range(2,max_components+1):
    #DISQUAL with k components
    dq = DISQUAL(n_components=k,target=['classe'],features_labels=list(dfLearning.columns[1:]),row_labels=dfLearning.index)
    dq.fit(dfLearning)
    #resubstitution accuracy
    acc_resub[k-2] = metrics.accuracy_score(dfLearning.classe, dq.predict(dfLearning.iloc[:,1:]).predict)
    #accuracy on the validation set
    acc_valid[k-2] = metrics.accuracy_score(dfValidation.classe, dq.predict(dfValidation.iloc[:,1:]).predict)
    
#results
print(acc_resub)
print(acc_valid)
[0.87422037 0.87318087 0.87318087 0.87525988 0.87872488 0.88877339
 0.89050589 0.89258489 0.89223839 0.89327789 0.8953569  0.89431739
 0.8970894  0.89258489]
[0.8723748  0.86833603 0.86752827 0.86268174 0.86752827 0.88449111
 0.88610662 0.88933764 0.88610662 0.88691438 0.88772213 0.88772213
 0.88852989 0.88691438]
In [ ]:
#plot - the regularization parameter is the number of components
import matplotlib.pyplot as plt
plt.plot(numpy.arange(2,max_components+1,1),acc_resub,marker='.',label='Resub.')
plt.plot(numpy.arange(2,max_components+1,1),acc_valid,marker='.',label='Valid. set')
plt.xlabel("# components")
plt.ylabel('Accuracy')
plt.legend()
plt.show()
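Rather than reading the curve, the number of components that maximizes the validation accuracy can be retrieved programmatically (a minimal sketch; the grid starts at k = 2, hence the offset).

#number of components maximizing the validation accuracy
best_k = int(numpy.argmax(acc_valid)) + 2
print(best_k)   #9 with the values above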
In [ ]:
#training with 9 components
#using the entire training set
dqOpt = DISQUAL(n_components=9,target=['classe'],features_labels=list(dfTrain.columns[1:]),row_labels=dfTrain.index)

#fit
dqOpt.fit(dfTrain)
Out[ ]:
DISQUAL(features_labels=['capshape', 'capcolor', 'gillsize', 'gillcolor',
                         'stalksurfaceabove', 'sporeprintcolor'],
        n_components=9,
        row_labels=Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       4114, 4115, 4116, 4117, 4118, 4119, 4120, 4121, 4122, 4123],
      dtype='int64', length=4124),
        target=['classe'])
In [ ]:
#evaluation on the test sample - report
print(metrics.classification_report(dfTest.classe,dqOpt.predict(dfTest.iloc[:,1:]).predict))
              precision    recall  f1-score   support

      edible       0.86      0.95      0.90      2048
   poisonous       0.94      0.84      0.89      1952

    accuracy                           0.90      4000
   macro avg       0.90      0.89      0.89      4000
weighted avg       0.90      0.90      0.89      4000