# Change working directory (hard-coded demo path — adjust for your machine)
import os
os.chdir("C:/Users/ricco/Desktop/demo")
# Load the dataset from Excel (expected: 432 rows x 8 columns, per df.info())
import pandas
df = pandas.read_excel("working_condition.xls")
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432 entries, 0 to 431
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CLASSE         432 non-null    object 
 1   profdev        432 non-null    float64
 2   conflict       432 non-null    float64
 3   regulat        432 non-null    float64
 4   jobvar         432 non-null    float64
 5   workgrp        432 non-null    float64
 6   standrds       432 non-null    float64
 7   SAMPLE_STATUS  432 non-null    object 
dtypes: float64(6), object(2)
memory usage: 27.1+ KB
# Train-test split is encoded in the SAMPLE_STATUS column
# Frequencies of each status value
df["SAMPLE_STATUS"].value_counts()
train 232 test 200 Name: SAMPLE_STATUS, dtype: int64
# Training sample: rows flagged "train", dropping the SAMPLE_STATUS column.
# Use a single .loc call with (row mask, column list) instead of chained
# indexing (df.loc[mask][cols]): chaining builds an intermediate copy and
# can trigger pandas' SettingWithCopyWarning on later assignments.
dfTrain = df.loc[df.SAMPLE_STATUS == "train", df.columns[:-1]]
dfTrain.shape
(232, 7)
# Test sample: rows flagged "test", dropping the SAMPLE_STATUS column.
# Same single-.loc selection as for the training sample (avoids the
# chained-indexing intermediate copy).
dfTest = df.loc[df.SAMPLE_STATUS == "test", df.columns[:-1]]
dfTest.shape
(200, 7)
# LDA from scientisttools - model specification
# (features = every column after the CLASSE target, target = CLASSE)
from scientisttools.discriminant_analysis import LDA
adl = LDA(features_labels=list(dfTrain.columns[1:]),target=['CLASSE'],row_labels=dfTrain.index)
# Fit on the training sample
adl.fit(dfTrain)
LDA(features_labels=['profdev', 'conflict', 'regulat', 'jobvar', 'workgrp', 'standrds'], row_labels=Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ... 222, 223, 224, 225, 226, 227, 228, 229, 230, 231], dtype='int64', length=232), target=['CLASSE'])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LDA(features_labels=['profdev', 'conflict', 'regulat', 'jobvar', 'workgrp', 'standrds'], row_labels=Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ... 222, 223, 224, 225, 226, 227, 228, 229, 230, 231], dtype='int64', length=232), target=['CLASSE'])
# General information about the fitted model
# (sample size, number of variables/classes, degrees of freedom)
adl.summary_information_
value | |
---|---|
Total Sample Size | 232 |
Variables | 6 |
Classes | 2 |
DF Total | 231 |
DF Within Classes | 230 |
DF Between Classes | 1 |
# Prior class distribution estimated from the training sample
adl.priors_
CLASSE good 0.62069 poor 0.37931 dtype: float64
# Coefficients of the linear discriminant (classification) functions,
# one column per class — SAS terminology
adl.coef_
good | poor | |
---|---|---|
profdev | 0.854752 | -0.097835 |
conflict | 2.848816 | 3.029632 |
regulat | 4.947790 | 5.437828 |
jobvar | 0.533710 | 0.641023 |
workgrp | 5.865712 | 4.935636 |
standrds | 5.587691 | 5.801884 |
# Intercepts (constants) of the classification functions
adl.intercept_
good | poor | |
---|---|---|
Intercept | -51.646172 | -47.38139 |
# Statistical contribution of each variable:
# Wilks' lambda, partial lambda, F statistic and p-value
adl.statistical_evaluation_
Wilks L. | Partial L. | F(1, 225) | p-value | |
---|---|---|---|---|
profdev | 0.704069 | 0.946807 | 12.640948 | 0.000460 |
conflict | 0.668024 | 0.997894 | 0.474834 | 0.491482 |
regulat | 0.691957 | 0.963379 | 8.552823 | 0.003803 |
jobvar | 0.667397 | 0.998832 | 0.263134 | 0.608479 |
workgrp | 0.734287 | 0.907843 | 22.840106 | 0.000003 |
standrds | 0.669898 | 0.995102 | 1.107397 | 0.293777 |
# Keep only the variables significant at the 1% level
signif = adl.statistical_evaluation_["p-value"] <= 0.01
adl.statistical_evaluation_[signif]
Wilks L. | Partial L. | F(1, 225) | p-value | |
---|---|---|---|---|
profdev | 0.704069 | 0.946807 | 12.640948 | 0.000460 |
regulat | 0.691957 | 0.963379 | 8.552823 | 0.003803 |
workgrp | 0.734287 | 0.907843 | 22.840106 | 0.000003 |
# Going further with the statistics
# Within-class (pooled) variance-covariance matrix
adl.wcov_
profdev | conflict | regulat | jobvar | workgrp | standrds | |
---|---|---|---|---|---|---|
profdev | 0.993181 | 0.759772 | -0.079565 | 0.583546 | 0.254767 | 0.164452 |
conflict | 0.759772 | 1.024159 | -0.175415 | 0.512132 | 0.329523 | 0.068012 |
regulat | -0.079565 | -0.175415 | 1.077627 | -0.144406 | -0.277869 | 0.060229 |
jobvar | 0.583546 | 0.512132 | -0.144406 | 1.068645 | 0.328425 | 0.269218 |
workgrp | 0.254767 | 0.329523 | -0.277869 | 0.328425 | 0.857176 | 0.087267 |
standrds | 0.164452 | 0.068012 | 0.060229 | 0.269218 | 0.087267 | 0.763119 |
# Total variance-covariance matrix
adl.tcov_
profdev | conflict | regulat | jobvar | workgrp | standrds | |
---|---|---|---|---|---|---|
profdev | 1.219137 | 0.957495 | -0.272420 | 0.736690 | 0.501598 | 0.164595 |
conflict | 0.957495 | 1.195208 | -0.343318 | 0.645814 | 0.544542 | 0.068464 |
regulat | -0.272420 | -0.343318 | 1.235069 | -0.274399 | -0.484699 | 0.059251 |
jobvar | 0.736690 | 0.645814 | -0.274399 | 1.169264 | 0.494625 | 0.268630 |
workgrp | 0.501598 | 0.544542 | -0.484699 | 0.494625 | 1.120436 | 0.087809 |
standrds | 0.164595 | 0.068464 | 0.059251 | 0.268630 | 0.087809 | 0.759819 |
# Sample size and number of classes
n = adl.n_samples_
K = adl.n_classes_
# Bias-corrected (MLE-style) estimates of the covariance matrices,
# needed for the Wilks' lambda computation (book, page 60)
# Within-class matrix, corrected by (n - K) / n
w_corr = (n - K) / n * adl.wcov_.values
# Total matrix, corrected by (n - 1) / n
t_corr = (n - 1) / n * adl.tcov_.values
# Wilks' lambda = det(corrected within) / det(corrected total)
import numpy
wilks = numpy.linalg.det(w_corr) / numpy.linalg.det(t_corr)
wilks
0.666617467385988
# Predictions on the test sample (skip the first column, CLASSE)
pred = adl.predict(dfTest.iloc[:, 1:])
# Distribution of the predicted labels
pred["predict"].value_counts()
good 133 poor 67 Name: predict, dtype: int64
# Confusion matrix on the test sample (rendered as a plot)
from sklearn import metrics
metrics.ConfusionMatrixDisplay.from_predictions(dfTest.CLASSE, pred.predict)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1e76564fe20>
# Classification report: precision / recall / F1 per class + accuracy
print(metrics.classification_report(dfTest.CLASSE, pred.predict))
              precision    recall  f1-score   support

        good       0.82      0.83      0.83       131
        poor       0.67      0.65      0.66        69

    accuracy                           0.77       200
   macro avg       0.75      0.74      0.74       200
weighted avg       0.77      0.77      0.77       200
# Accuracy on the test sample, more precisely
acc = metrics.accuracy_score(dfTest.CLASSE, pred.predict)
print(acc)
0.77
# 90% confidence interval for the test accuracy
from statsmodels.stats.proportion import proportion_confint
# Pass an exact integer success count: acc * n is a float product and can
# be off by a few ULPs (e.g. 0.77 * 200 != 154.0 exactly); round() restores
# the true count of correct predictions.
n_test = dfTest.shape[0]
proportion_confint(count=round(acc * n_test), nobs=n_test, alpha=0.1)
(0.7210535115804874, 0.8189464884195127)
# Backward variable selection at the 1% significance level
from scientisttools.discriminant_analysis import STEPDISC
stepdisc = STEPDISC(method="backward",alpha=0.01,model_train=True,verbose=True)
stepdisc.fit(adl)
Wilks L. Partial L. F p-value profdev 0.704069 0.946807 12.640948 0.000460 conflict 0.668024 0.997894 0.474834 0.491482 regulat 0.691957 0.963379 8.552823 0.003803 jobvar 0.667397 0.998832 0.263134 0.608479 workgrp 0.734287 0.907843 22.840106 0.000003 standrds 0.669898 0.995102 1.107397 0.293777 Wilks L. Partial L. F p-value profdev 0.705277 0.946291 12.827184 0.000418 conflict 0.669015 0.997582 0.547783 0.459993 regulat 0.692297 0.964033 8.431801 0.004053 workgrp 0.734639 0.908470 22.770028 0.000003 standrds 0.671784 0.993469 1.485672 0.224160 Wilks L. Partial L. F p-value profdev 0.729807 0.916701 20.627184 0.000009 regulat 0.692968 0.965433 8.127630 0.004761 workgrp 0.734748 0.910536 22.303702 0.000004 standrds 0.672906 0.994217 1.320430 0.251724 Wilks L. Partial L. F p-value profdev 0.730442 0.921232 19.494742 0.000016 regulat 0.699619 0.961818 9.051172 0.002920 workgrp 0.736816 0.913262 21.654410 0.000006
STEPDISC(method='backward', model_train=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
STEPDISC(method='backward', model_train=True)
# Model re-trained on the selected variables only
adl_stepdisc = stepdisc.train_model_
# Coefficients of the reduced model
adl_stepdisc.coef_
good | poor | |
---|---|---|
profdev | 4.035209 | 3.302228 |
regulat | 5.220522 | 5.712405 |
workgrp | 6.877362 | 6.015048 |
# Intercepts of the reduced model
adl_stepdisc.intercept_
good | poor | |
---|---|---|
Intercept | -38.93664 | -33.485441 |
# List of the selected variables
adl_stepdisc.features_labels_
['profdev', 'regulat', 'workgrp']
# Predict on the test set using only the selected variables, then accuracy
selected = adl_stepdisc.features_labels_
predSel = adl_stepdisc.predict(dfTest[selected])
metrics.accuracy_score(dfTest["CLASSE"], predSel["predict"])
0.765