Version¶
In [1]:
#version
import mlxtend
mlxtend.__version__
Out[1]:
'0.23.4'
Data import and preparation¶
In [2]:
#working directory
import os
os.chdir("C:/Users/ricco/Desktop/demo")
#load the dataset
import pandas
df = pandas.read_excel("spam_sous_partie.xlsx")
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 53 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   wf_make        4601 non-null   float64
 1   wf_address     4601 non-null   float64
 2   wf_all         4601 non-null   float64
 3   wf_3d          4601 non-null   float64
 4   wf_our         4601 non-null   float64
 5   wf_over        4601 non-null   float64
 6   wf_remove      4601 non-null   float64
 7   wf_internet    4601 non-null   float64
 8   wf_order       4601 non-null   float64
 9   wf_mail        4601 non-null   float64
 10  wf_receive     4601 non-null   float64
 11  wf_will        4601 non-null   float64
 12  wf_people      4601 non-null   float64
 13  wf_report      4601 non-null   float64
 14  wf_addresses   4601 non-null   float64
 15  wf_free        4601 non-null   float64
 16  wf_business    4601 non-null   float64
 17  wf_email       4601 non-null   float64
 18  wf_you         4601 non-null   float64
 19  wf_credit      4601 non-null   float64
 20  wf_your        4601 non-null   float64
 21  wf_font        4601 non-null   float64
 22  wf_000         4601 non-null   float64
 23  wf_money       4601 non-null   float64
 24  wf_hp          4601 non-null   float64
 25  wf_hpl         4601 non-null   float64
 26  wf_lab         4601 non-null   float64
 27  wf_labs        4601 non-null   float64
 28  wf_telnet      4601 non-null   float64
 29  wf_857         4601 non-null   float64
 30  wf_data        4601 non-null   float64
 31  wf_415         4601 non-null   float64
 32  wf_85          4601 non-null   float64
 33  wf_technology  4601 non-null   float64
 34  wf_1999        4601 non-null   float64
 35  wf_parts       4601 non-null   float64
 36  wf_pm          4601 non-null   float64
 37  wf_direct      4601 non-null   float64
 38  wf_cs          4601 non-null   float64
 39  wf_meeting     4601 non-null   float64
 40  wf_original    4601 non-null   float64
 41  wf_project     4601 non-null   float64
 42  wf_re          4601 non-null   float64
 43  wf_edu         4601 non-null   float64
 44  wf_table       4601 non-null   float64
 45  wf_conference  4601 non-null   float64
 46  cf_;           4601 non-null   float64
 47  cf_(           4601 non-null   float64
 48  cf_[           4601 non-null   float64
 49  cf_!           4601 non-null   float64
 50  cf_$           4601 non-null   float64
 51  cf_#           4601 non-null   float64
 52  spam           4601 non-null   object 
dtypes: float64(52), object(1)
memory usage: 1.9+ MB
In [3]:
#prepare the data structures (features X, target y)
X = df[df.columns[:-1]]
y = df.spam
In [4]:
#train-test split
#we deliberately put ourselves in a difficult setting (small training set) for the purposes of the experiment
from sklearn.model_selection import train_test_split
XTrain, XTest, yTrain, yTest = train_test_split(X,y,train_size=601,random_state=1,stratify=y)
#dimensions
print(XTrain.shape)
print(XTest.shape)
(601, 52)
(4000, 52)
Modeling - Evaluation¶
Train-test scheme¶
In [5]:
#Naive Bayes
from sklearn.naive_bayes import GaussianNB
clf_NB = GaussianNB()
clf_NB.fit(XTrain,yTrain)
#class information - e.g. marginal relative frequencies (priors)
clf_NB.class_prior_
Out[5]:
array([0.60565724, 0.39434276])
In [6]:
#prediction on the test set
pred_NB = clf_NB.predict(XTest)
#frequencies of the predicted classes
import numpy
numpy.unique(pred_NB,return_counts=True)
Out[6]:
(array(['no', 'yes'], dtype='<U3'), array([1725, 2275]))
In [7]:
#accuracy from mlxtend
from mlxtend.evaluate import accuracy_score
acc_NB = accuracy_score(yTest,pred_NB)
print(acc_NB)
0.76025
In [8]:
#90% confidence interval
from scipy.stats import binomtest
#compute the statistic
result_NB = binomtest(int(acc_NB*XTest.shape[0]), XTest.shape[0])
#display
print(f"Proportion : {result_NB.statistic}")
print(result_NB.proportion_ci(confidence_level=0.90))
Proportion : 0.76025
ConfidenceInterval(low=0.7488827534979696, high=0.771335293815457)
In [9]:
#confusion matrix
from mlxtend.evaluate import confusion_matrix
cm_NB = confusion_matrix(yTest,pred_NB)
print(cm_NB)
[[1595  829]
 [ 130 1446]]
In [10]:
#and a graphical display, if preferred
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_confusion_matrix
fig, ax = plot_confusion_matrix(conf_mat=cm_NB,class_names=clf_NB.classes_)
plt.show()
0.632 bootstrap evaluation¶
In [11]:
#import the function
from mlxtend.evaluate import bootstrap_point632_score
#compute the intermediate accuracies - 0.632 bootstrap
#relies on the TRAIN sample only /!\
#a good option for small sample sizes
acc_scores = bootstrap_point632_score(clf_NB,XTrain,yTrain,
                                      method=".632",random_seed=213)
#print the first 10 values
print(acc_scores[:10])
[0.78454173 0.76465975 0.76912538 0.76844959 0.7829406  0.77243322
 0.76482548 0.81284593 0.78766228 0.80482604]
In [12]:
#mean of the intermediate accuracies
print(f"Accuracy (.632 bootstrap) = {numpy.mean(acc_scores)}")
Accuracy (.632 bootstrap) = 0.7767484891494922
In [13]:
#empirical 90% confidence interval
bb = numpy.percentile(acc_scores,5.0)
bh = numpy.percentile(acc_scores,95.0)
print(f"Confidence interval: ({bb}, {bh})")
Confidence interval: (0.740674842030313, 0.8060027019071913)
There is even the 0.632+ bootstrap, admittedly at the cost of a higher computation time...
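As a sketch (not run here), that variant only requires changing the method argument of bootstrap_point632_score, reusing the clf_NB, XTrain and yTrain objects from above:

#0.632+ bootstrap - same call as before, only the method argument changes (sketch, output not shown)
from mlxtend.evaluate import bootstrap_point632_score
import numpy
acc_scores_plus = bootstrap_point632_score(clf_NB,XTrain,yTrain,
                                           method=".632+",random_seed=213)
print(f"Accuracy (.632+ bootstrap) = {numpy.mean(acc_scores_plus)}")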
Model comparison¶
Training - test¶
In [14]:
#k-nearest neighbors algorithm
from sklearn.neighbors import KNeighborsClassifier
#instantiation and training - default parameters, i.e. 5-NN
clf_PPV = KNeighborsClassifier()
clf_PPV.fit(XTrain,yTrain)
Out[14]:
KNeighborsClassifier()
In [15]:
#prediction on the test set
pred_PPV = clf_PPV.predict(XTest)
#and accuracy -- clearly better than Gaussian Naive Bayes
acc_PPV = accuracy_score(yTest,pred_PPV)
print(acc_PPV)
0.863
In [16]:
#compute the statistic
result_PPV = binomtest(int(acc_PPV*XTest.shape[0]), XTest.shape[0])
#really clearly BETTER than NB
#the two confidence intervals do not overlap
print(f"Proportion : {result_PPV.statistic}")
print(result_PPV.proportion_ci(confidence_level=0.90))
Proportion : 0.863
ConfidenceInterval(low=0.8537359551567779, high=0.8718694475561123)
Comparison with repeated cross-validation¶
Dietterich's approach (5 x 2cv)
In [17]:
#https://rasbt.github.io/mlxtend/user_guide/evaluate/paired_ttest_5x2cv/
from mlxtend.evaluate import paired_ttest_5x2cv
#relies on the TRAIN sample only /!\
#a good option for small sample sizes
t, p = paired_ttest_5x2cv(estimator1=clf_PPV,
                          estimator2=clf_NB,
                          X=XTrain, y=yTrain,
                          scoring="accuracy",
                          random_seed=213)
print(f"t statistic: {t}")
print(f"p value: {p}")
t statistic: 2.899865965911326
p value: 0.03379597948962304
Alpaydin's approach (5 x 2cv)
In [18]:
#https://rasbt.github.io/mlxtend/user_guide/evaluate/combined_ftest_5x2cv/
from mlxtend.evaluate import combined_ftest_5x2cv
#relies on the TRAIN sample only /!\
#a good option for small sample sizes
f, p = combined_ftest_5x2cv(estimator1=clf_PPV,
                            estimator2=clf_NB,
                            X=XTrain, y=yTrain,
                            scoring="accuracy",
                            random_seed=213)
print('F statistic: %.3f' % f)
print('p value: %.3f' % p)
F statistic: 6.891
p value: 0.023
Classifier stacking¶
In [19]:
#agreement between the test predictions
pandas.crosstab(pred_NB,pred_PPV)
Out[19]:
col_0    no   yes
row_0
no     1541   184
yes     861  1414
The decisions are not unanimous, so there is room for improvement...
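As a quick check (a small sketch reusing pred_NB and pred_PPV from above), the raw agreement rate can be read off the crosstab: (1541 + 1414) / 4000 ≈ 0.74.

#proportion of test instances on which NB and 5-NN agree
#from the crosstab above: (1541 + 1414) / 4000, i.e. roughly 0.74
import numpy
agreement = numpy.mean(pred_NB == pred_PPV)
print(f"Agreement rate: {agreement}")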
Simple voting¶
In [20]:
#voting algorithm
from mlxtend.classifier import EnsembleVoteClassifier
vote_clf = EnsembleVoteClassifier(clfs=[clf_NB, clf_PPV], weights=[1,1])
vote_clf.fit(XTrain,yTrain)
Out[20]:
EnsembleVoteClassifier(clfs=[GaussianNB(), KNeighborsClassifier()], weights=[1, 1])
In [21]:
#prediction on the test set
pred_vote = vote_clf.predict(XTest)
#accuracy
acc_vote = accuracy_score(yTest,pred_vote)
print(acc_vote)
0.8695
In [22]:
#as a reminder, accuracy and confidence interval of 5-NN
print(f"Acc PPV : {result_PPV.statistic}")
print(result_PPV.proportion_ci(confidence_level=0.90))
Acc PPV : 0.863
ConfidenceInterval(low=0.8537359551567779, high=0.8718694475561123)
Stacking¶
Principle: use a classifier to learn the weights to assign to the models being combined.
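To make the principle concrete, here is a minimal hand-rolled sketch of the idea (not the mlxtend implementation used below): get out-of-fold class-membership probabilities for each base classifier with scikit-learn's cross_val_predict, stack them as meta-features, and fit a logistic regression on top. The names proba_nb, proba_ppv and meta_features are illustrative.

#hand-rolled illustration of the stacking principle (sketch only)
import numpy
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
#out-of-fold probabilities of each base classifier on the TRAIN set
proba_nb  = cross_val_predict(clf_NB,  XTrain, yTrain, cv=5, method="predict_proba")
proba_ppv = cross_val_predict(clf_PPV, XTrain, yTrain, cv=5, method="predict_proba")
#meta-features = stacked probabilities, one block per base classifier
meta_features = numpy.hstack([proba_nb, proba_ppv])
#the meta-classifier learns how to weight these probabilities
meta_model = LogisticRegression()
meta_model.fit(meta_features, yTrain)
print(meta_model.coef_)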
In [23]:
#algorithm used to compute the weights
#assigned to the classifiers
from sklearn.linear_model import LogisticRegression
lr_combin = LogisticRegression()
In [24]:
#stacking with mlxtend, using cross-validation
from mlxtend.classifier import StackingCVClassifier
#instantiation
#combination based on the class-membership probabilities (use_probas)
stack_clf = StackingCVClassifier(classifiers=[clf_NB, clf_PPV],
                                 use_probas=True,
                                 meta_classifier=lr_combin,
                                 cv=5,
                                 random_state=1)
In [25]:
#training
#WARNING: may raise an error due to a compatibility issue with scikit-learn (downgrade it)
#or, another possible solution
#https://github.com/rasbt/mlxtend/issues/1117
stack_clf.fit(XTrain,yTrain)
Out[25]:
StackingCVClassifier(classifiers=[GaussianNB(), KNeighborsClassifier()], cv=5, meta_classifier=LogisticRegression(), random_state=1, use_probas=True)
In [26]:
#weighting coefficients for the estimated probabilities
#4 values because the meta-features are:
#(1) Naive Bayes (NB): proba spam = no, proba spam = yes
#(2) 5-NN: proba spam = no, proba spam = yes
stack_clf.meta_clf_.coef_
Out[26]:
array([[-1.28864366, 1.29587905, -2.17820213, 2.18543752]])
/!\ Note that 5-NN weighs almost twice as much as NB in the meta-classifier!
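To quantify this remark (a small sketch; the first two coefficients correspond to NB, the last two to 5-NN), the ratio of the mean absolute coefficients is (2.178 + 2.185) / (1.289 + 1.296) ≈ 1.7.

#relative weight of 5-NN vs. NB in the meta-classifier
import numpy
coefs = numpy.abs(stack_clf.meta_clf_.coef_[0])
print(coefs[2:].mean() / coefs[:2].mean())   #close to 1.7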
In [27]:
#for the record: there is also an intercept
stack_clf.meta_clf_.intercept_
Out[27]:
array([-0.72085533])
In [28]:
#prediction on the test set
pred_stack = stack_clf.predict(XTest)
#accuracy
acc_stack = accuracy_score(yTest,pred_stack)
print(acc_stack)
0.87825
In [29]:
#as a reminder, accuracy and confidence interval of 5-NN
print(f"Acc PPV : {result_PPV.statistic}")
print(result_PPV.proportion_ci(confidence_level=0.90))
Acc PPV : 0.863
ConfidenceInterval(low=0.8537359551567779, high=0.8718694475561123)
An improvement: the "accuracy" now lies outside the confidence interval of 5-NN... but the gain remains quite small nonetheless.
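To back this up, the same binomial confidence interval as above can be computed for the stacked model (sketch, output not reproduced here):

#binomial confidence interval for the stacking accuracy, same recipe as earlier
from scipy.stats import binomtest
result_stack = binomtest(int(acc_stack*XTest.shape[0]), XTest.shape[0])
print(f"Proportion : {result_stack.statistic}")
print(result_stack.proportion_ci(confidence_level=0.90))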