Importing and preparing the data

In [ ]:
#change the working directory
import os
os.chdir("C:/Users/ricco/Desktop/demo")

#import the data
import pandas
df = pandas.read_excel("working_condition.xls")
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432 entries, 0 to 431
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CLASSE         432 non-null    object 
 1   profdev        432 non-null    float64
 2   conflict       432 non-null    float64
 3   regulat        432 non-null    float64
 4   jobvar         432 non-null    float64
 5   workgrp        432 non-null    float64
 6   standrds       432 non-null    float64
 7   SAMPLE_STATUS  432 non-null    object 
dtypes: float64(6), object(2)
memory usage: 27.1+ KB
In [ ]:
#train-test split -- using the SAMPLE_STATUS column
#class counts
df.SAMPLE_STATUS.value_counts()
Out[ ]:
train    232
test     200
Name: SAMPLE_STATUS, dtype: int64
In [ ]:
#training sample + drop the SAMPLE_STATUS variable
dfTrain = df.loc[df.SAMPLE_STATUS=="train"][df.columns[:-1]]
dfTrain.shape
Out[ ]:
(232, 7)
In [ ]:
#test sample
dfTest = df.loc[df.SAMPLE_STATUS=="test"][df.columns[:-1]]
dfTest.shape
Out[ ]:
(200, 7)

Predictive discriminant analysis

Training - Discriminant functions

In [ ]:
#LDA from scientisttools - specification
from scientisttools.discriminant_analysis import LDA
adl = LDA(features_labels=list(dfTrain.columns[1:]),target=['CLASSE'],row_labels=dfTrain.index)
In [ ]:
#fitting on the training data
adl.fit(dfTrain)
Out[ ]:
LDA(features_labels=['profdev', 'conflict', 'regulat', 'jobvar', 'workgrp',
                     'standrds'],
    row_labels=Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            222, 223, 224, 225, 226, 227, 228, 229, 230, 231],
           dtype='int64', length=232),
    target=['CLASSE'])
In [ ]:
#general information
adl.summary_information_
Out[ ]:
                    value
Total Sample Size     232
Variables               6
Classes                 2
DF Total              231
DF Within Classes     230
DF Between Classes      1
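
These degrees of freedom follow directly from the sample: with n = 232 and K = 2 classes, DF Total = n - 1 = 231, DF Within = n - K = 230 and DF Between = K - 1 = 1.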
In [ ]:
#class distribution
adl.priors_
Out[ ]:
CLASSE
good      0.62069
poor      0.37931
dtype: float64
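
These priors appear to be the empirical class proportions of the training sample. A quick check (a sketch added here, assuming proportional priors; not part of the original run):

#class proportions in the training sample -- should match adl.priors_
dfTrain.CLASSE.value_counts(normalize=True)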
In [ ]:
#coefficients - linear discriminant functions
#SAS terminology
adl.coef_
Out[ ]:
              good      poor
profdev   0.854752 -0.097835
conflict  2.848816  3.029632
regulat   4.947790  5.437828
jobvar    0.533710  0.641023
workgrp   5.865712  4.935636
standrds  5.587691  5.801884
In [ ]:
#intercepts
adl.intercept_
Out[ ]:
                good       poor
Intercept -51.646172  -47.38139
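
With these SAS-style classification functions, a case is scored once per class (intercept plus a weighted sum of the descriptors) and assigned to the class with the largest score. A minimal sketch of that rule, assuming predict applies the usual argmax on these functions (not part of the original run):

#classify the test cases by hand from the classification functions
import numpy
X = dfTest[dfTest.columns[1:]].values                    #descriptors, same order as the rows of adl.coef_
scores = X @ adl.coef_.values + adl.intercept_.values    #one score per class (columns good, poor)
manual_pred = adl.coef_.columns[scores.argmax(axis=1)]   #class with the highest score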

Inspecting the variables

In [ ]:
#variable contributions
adl.statistical_evaluation_
Out[ ]:
          Wilks L.  Partial L.  F(1, 225)   p-value
profdev   0.704069    0.946807  12.640948  0.000460
conflict  0.668024    0.997894   0.474834  0.491482
regulat   0.691957    0.963379   8.552823  0.003803
jobvar    0.667397    0.998832   0.263134  0.608479
workgrp   0.734287    0.907843  22.840106  0.000003
standrds  0.669898    0.995102   1.107397  0.293777
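
To read this table: the "Wilks L." column is the Lambda obtained when the variable is removed from the model, "Partial L." is the ratio of the full model's Lambda (0.6666, recomputed below from the covariance matrices) to that value, and the F statistic follows from the partial Lambda with (1, n - K - p + 1) = (1, 225) degrees of freedom. A hand check for profdev, using the values printed above:

#F from the partial Lambda of profdev
pL = 0.946807                          #Partial L. of profdev
F = (1 - pL)/pL * (232 - 2 - 6 + 1)    #n - K - p + 1 = 225
F                                      #about 12.64, as reported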
In [ ]:
#e.g. the variables relevant at the 1% level
adl.statistical_evaluation_.loc[adl.statistical_evaluation_['p-value']<=0.01]
Out[ ]:
          Wilks L.  Partial L.  F(1, 225)   p-value
profdev   0.704069    0.946807  12.640948  0.000460
regulat   0.691957    0.963379   8.552823  0.003803
workgrp   0.734287    0.907843  22.840106  0.000003

Going further with the VAR-COVAR matrices

In [ ]:
#going further with the statistics
#within-class covariance matrix
adl.wcov_
Out[ ]:
           profdev  conflict   regulat    jobvar   workgrp  standrds
profdev   0.993181  0.759772 -0.079565  0.583546  0.254767  0.164452
conflict  0.759772  1.024159 -0.175415  0.512132  0.329523  0.068012
regulat  -0.079565 -0.175415  1.077627 -0.144406 -0.277869  0.060229
jobvar    0.583546  0.512132 -0.144406  1.068645  0.328425  0.269218
workgrp   0.254767  0.329523 -0.277869  0.328425  0.857176  0.087267
standrds  0.164452  0.068012  0.060229  0.269218  0.087267  0.763119
In [ ]:
#total covariance matrix
adl.tcov_
Out[ ]:
           profdev  conflict   regulat    jobvar   workgrp  standrds
profdev   1.219137  0.957495 -0.272420  0.736690  0.501598  0.164595
conflict  0.957495  1.195208 -0.343318  0.645814  0.544542  0.068464
regulat  -0.272420 -0.343318  1.235069 -0.274399 -0.484699  0.059251
jobvar    0.736690  0.645814 -0.274399  1.169264  0.494625  0.268630
workgrp   0.501598  0.544542 -0.484699  0.494625  1.120436  0.087809
standrds  0.164595  0.068464  0.059251  0.268630  0.087809  0.759819
In [ ]:
#n and K
n = adl.n_samples_
K = adl.n_classes_
In [ ]:
#Wb - book, page 60
#corrected estimates of the matrices
#for computing Wilks' Lambda
wb = (n-K)/n*adl.wcov_.values

#Vb
vb = (n-1)/n*adl.tcov_.values

#Wilks' Lambda
import numpy
wilks = numpy.linalg.det(wb)/numpy.linalg.det(vb)
wilks
Out[ ]:
0.666617467385988
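
With K = 2 classes, this Lambda converts exactly into an overall F test of the model: F = ((1 - Lambda)/Lambda) x ((n - p - 1)/p), with (p, n - p - 1) degrees of freedom. A small check with scipy (an addition, not part of the original run), which gives F of about 18.75 on (6, 225) degrees of freedom, far below the 1% threshold:

#overall significance of the 6-variable model from Wilks' Lambda (exact for 2 classes)
from scipy.stats import f
p = adl.coef_.shape[0]                   #number of descriptors = 6
F = (1 - wilks)/wilks * (n - p - 1)/p
pvalue = f.sf(F, p, n - p - 1)
print(F, pvalue)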

Evaluation on the test sample

In [ ]:
#prediction on the test set
pred = adl.predict(dfTest[dfTest.columns[1:]])
pred.predict.value_counts()
Out[ ]:
good    133
poor     67
Name: predict, dtype: int64
In [ ]:
#confusion matrix
from sklearn import metrics
metrics.ConfusionMatrixDisplay.from_predictions(dfTest.CLASSE, pred.predict)
Out[ ]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1e76564fe20>
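If the raw counts are preferred to the plot, the same cross-tabulation can be printed directly (an equivalent alternative, not part of the original run):

#confusion matrix as raw counts
pandas.crosstab(dfTest.CLASSE, pred.predict)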
In [ ]:
#classification report
print(metrics.classification_report(dfTest.CLASSE, pred.predict))
              precision    recall  f1-score   support

        good       0.82      0.83      0.83       131
        poor       0.67      0.65      0.66        69

    accuracy                           0.77       200
   macro avg       0.75      0.74      0.74       200
weighted avg       0.77      0.77      0.77       200

In [ ]:
#more precisely, the accuracy value
acc = metrics.accuracy_score(dfTest.CLASSE, pred.predict)
print(acc)
0.77
In [ ]:
#90% confidence interval
from statsmodels.stats.proportion import proportion_confint
proportion_confint(count=acc*dfTest.shape[0],nobs=dfTest.shape[0],alpha=0.1)
Out[ ]:
(0.7210535115804874, 0.8189464884195127)
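
By default proportion_confint uses the normal approximation, so the same bounds can be recovered by hand as acc ± z(0.95) x sqrt(acc x (1 - acc)/n). A quick check (not part of the original run):

#normal-approximation interval by hand
from scipy.stats import norm
import numpy
z = norm.ppf(0.95)                                   #about 1.645 for a two-sided 90% interval
half = z * numpy.sqrt(acc*(1 - acc)/dfTest.shape[0])
(acc - half, acc + half)                             #matches the bounds above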

Variable selection (backward)

Stepwise - Backward

In [ ]:
#variable selection
from scientisttools.discriminant_analysis import STEPDISC
stepdisc = STEPDISC(method="backward",alpha=0.01,model_train=True,verbose=True)
stepdisc.fit(adl)
          Wilks L.  Partial L.          F   p-value
profdev   0.704069    0.946807  12.640948  0.000460
conflict  0.668024    0.997894   0.474834  0.491482
regulat   0.691957    0.963379   8.552823  0.003803
jobvar    0.667397    0.998832   0.263134  0.608479
workgrp   0.734287    0.907843  22.840106  0.000003
standrds  0.669898    0.995102   1.107397  0.293777

          Wilks L.  Partial L.          F   p-value
profdev   0.705277    0.946291  12.827184  0.000418
conflict  0.669015    0.997582   0.547783  0.459993
regulat   0.692297    0.964033   8.431801  0.004053
workgrp   0.734639    0.908470  22.770028  0.000003
standrds  0.671784    0.993469   1.485672  0.224160

          Wilks L.  Partial L.          F   p-value
profdev   0.729807    0.916701  20.627184  0.000009
regulat   0.692968    0.965433   8.127630  0.004761
workgrp   0.734748    0.910536  22.303702  0.000004
standrds  0.672906    0.994217   1.320430  0.251724

         Wilks L.  Partial L.          F   p-value
profdev  0.730442    0.921232  19.494742  0.000016
regulat  0.699619    0.961818   9.051172  0.002920
workgrp  0.736816    0.913262  21.654410  0.000006

Out[ ]:
STEPDISC(method='backward', model_train=True)
In [ ]:
#model with the selected variables
adl_stepdisc = stepdisc.train_model_

#coefficients
adl_stepdisc.coef_
Out[ ]:
             good      poor
profdev  4.035209  3.302228
regulat  5.220522  5.712405
workgrp  6.877362  6.015048
In [ ]:
#intercept
adl_stepdisc.intercept_
Out[ ]:
                good       poor
Intercept  -38.93664 -33.485441

Evaluation of the simplified model on the test sample

In [ ]:
#list of the selected variables
adl_stepdisc.features_labels_
Out[ ]:
['profdev', 'regulat', 'workgrp']
In [ ]:
#test-set prediction from the selected variables + accuracy
predSel = adl_stepdisc.predict(dfTest[adl_stepdisc.features_labels_])
metrics.accuracy_score(dfTest.CLASSE,predSel.predict)
Out[ ]:
0.765
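
With only three of the six descriptors, the simplified model reaches 0.765 accuracy on the test sample, practically the same as the 0.77 of the full model and well inside its 90% confidence interval.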