#change the working directory
import os
os.chdir("C:/Users/ricco/Desktop/demo")
#read the data file
import pandas
D = pandas.read_excel("spam.xlsx")
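#aside (not in the original session): pandas.read_excel relies on an
#external engine (openpyxl for .xlsx files) and raises an ImportError
#if none is installed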
#check the data
D.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 56 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   wf_make                     4601 non-null   float64
 1   wf_address                  4601 non-null   float64
 2   wf_all                      4601 non-null   float64
 3   wf_3d                       4601 non-null   float64
 4   wf_our                      4601 non-null   float64
 5   wf_over                     4601 non-null   float64
 6   wf_remove                   4601 non-null   float64
 7   wf_internet                 4601 non-null   float64
 8   wf_order                    4601 non-null   float64
 9   wf_mail                     4601 non-null   float64
 10  wf_receive                  4601 non-null   float64
 11  wf_will                     4601 non-null   float64
 12  wf_people                   4601 non-null   float64
 13  wf_report                   4601 non-null   float64
 14  wf_addresses                4601 non-null   float64
 15  wf_free                     4601 non-null   float64
 16  wf_business                 4601 non-null   float64
 17  wf_email                    4601 non-null   float64
 18  wf_you                      4601 non-null   float64
 19  wf_credit                   4601 non-null   float64
 20  wf_your                     4601 non-null   float64
 21  wf_font                     4601 non-null   float64
 22  wf_000                      4601 non-null   float64
 23  wf_money                    4601 non-null   float64
 24  wf_hp                       4601 non-null   float64
 25  wf_hpl                      4601 non-null   float64
 26  wf_lab                      4601 non-null   float64
 27  wf_labs                     4601 non-null   float64
 28  wf_telnet                   4601 non-null   float64
 29  wf_857                      4601 non-null   float64
 30  wf_data                     4601 non-null   float64
 31  wf_415                      4601 non-null   float64
 32  wf_85                       4601 non-null   float64
 33  wf_technology               4601 non-null   float64
 34  wf_1999                     4601 non-null   float64
 35  wf_parts                    4601 non-null   float64
 36  wf_pm                       4601 non-null   float64
 37  wf_direct                   4601 non-null   float64
 38  wf_cs                       4601 non-null   float64
 39  wf_meeting                  4601 non-null   float64
 40  wf_original                 4601 non-null   float64
 41  wf_project                  4601 non-null   float64
 42  wf_re                       4601 non-null   float64
 43  wf_edu                      4601 non-null   float64
 44  wf_table                    4601 non-null   float64
 45  wf_conference               4601 non-null   float64
 46  cf_comma                    4601 non-null   float64
 47  cf_bracket                  4601 non-null   float64
 48  cf_sq_bracket               4601 non-null   float64
 49  cf_exclam_point             4601 non-null   float64
 50  cf_dollar                   4601 non-null   float64
 51  cf_sharp                    4601 non-null   float64
 52  capital_run_length_average  4601 non-null   float64
 53  capital_run_length_longest  4601 non-null   int64
 54  capital_run_length_total    4601 non-null   int64
 55  spam                        4601 non-null   object
dtypes: float64(53), int64(2), object(1)
memory usage: 2.0+ MB
#class distribution
D.spam.value_counts()
no      2788
_yes    1813
Name: spam, dtype: int64
#** prepare the data structures **
#matrix of predictors
X = D[D.columns[:-1]]
#target vector
y = D.spam
#train-test split
from sklearn.model_selection import train_test_split
XTrain, XTest, yTrain, yTest = train_test_split(X,y,train_size=0.7,stratify=y,random_state=0)
#train set dimensions
print(XTrain.shape)
#test set dimensions
print(XTest.shape)
(3220, 55)
(1381, 55)
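#sanity check (sketch, not in the original session): with stratify=y the
#class proportions should be nearly identical in both partitions
print(yTrain.value_counts(normalize=True))
print(yTest.value_counts(normalize=True))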
#modeling with logistic regression
#Ridge, solver = liblinear, C = 0.01
from sklearn.linear_model import LogisticRegression
reg_first = LogisticRegression(solver="liblinear",penalty='l2',C=0.01)
reg_first.fit(XTrain,yTrain)
#shape of the coefficient matrix (one coefficient per predictor)
print(reg_first.coef_.shape)
(1, 55)
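#side note (sketch, not in the original session): the 55 coefficients can
#be paired with the predictor names for easier inspection
coefs = pandas.Series(reg_first.coef_[0], index=XTrain.columns)
print(coefs.sort_values().head())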
#prediction on the test set
pred_first = reg_first.predict(XTest)
#performance on the test set
from sklearn.metrics import classification_report
print(classification_report(yTest,pred_first))
              precision    recall  f1-score   support

        _yes       0.89      0.88      0.88       544
          no       0.92      0.93      0.93       837

    accuracy                           0.91      1381
   macro avg       0.91      0.90      0.91      1381
weighted avg       0.91      0.91      0.91      1381
#confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(yTest,pred_first))
[[480  64]
 [ 61 776]]
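#cross-check (sketch, not in the original session): the accuracy reported
#above can be recovered from the diagonal of the confusion matrix,
#(480 + 776) / 1381 ≈ 0.91
from sklearn.metrics import accuracy_score
print(accuracy_score(yTest,pred_first))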
#hyperparameters to explore
#penalty: Ridge (l2) or Lasso (l1)
#regularization strength: C
parametres = [{'penalty':['l1','l2'],'C':[0.01,0.1,1.0,10.0,100.0]}]
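#note: in scikit-learn the 'l1' penalty is only supported by the
#'liblinear' and 'saga' solvers; reg_first already uses 'liblinear',
#so both penalties in the grid are valid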
#grid search
from sklearn.model_selection import GridSearchCV
grille = GridSearchCV(estimator=reg_first,param_grid=parametres,scoring='accuracy',cv=10)
grille.fit(XTrain,yTrain)
GridSearchCV(cv=10, estimator=LogisticRegression(C=0.01, solver='liblinear'),
             param_grid=[{'C': [0.01, 0.1, 1.0, 10.0, 100.0],
                          'penalty': ['l1', 'l2']}],
             scoring='accuracy')
#display the results
print(pandas.DataFrame(grille.cv_results_).loc[:,['params','mean_test_score']])
                           params  mean_test_score
0    {'C': 0.01, 'penalty': 'l1'}         0.822981
1    {'C': 0.01, 'penalty': 'l2'}         0.896584
2     {'C': 0.1, 'penalty': 'l1'}         0.900932
3     {'C': 0.1, 'penalty': 'l2'}         0.909006
4     {'C': 1.0, 'penalty': 'l1'}         0.910870
5     {'C': 1.0, 'penalty': 'l2'}         0.909938
6    {'C': 10.0, 'penalty': 'l1'}         0.911491
7    {'C': 10.0, 'penalty': 'l2'}         0.911180
8   {'C': 100.0, 'penalty': 'l1'}         0.909938
9   {'C': 100.0, 'penalty': 'l2'}         0.911180
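#optional (sketch, not in the original session): the fold-to-fold standard
#deviations help judge how stable each configuration is over the 10 folds
print(pandas.DataFrame(grille.cv_results_).loc[:,['params','std_test_score']])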
#best parameter combination
print(grille.best_params_)
{'C': 10.0, 'penalty': 'l1'}
#best cross-validated score
print(grille.best_score_)
0.9114906832298135
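#follow-up (sketch, not in the original session): with the winning 'l1'
#penalty some coefficients may be driven exactly to zero - count them
import numpy
print(numpy.sum(grille.best_estimator_.coef_[0] == 0.0))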
#prediction with the best model
#we could also have called grille.predict() directly - see the documentation
pred_best = grille.best_estimator_.predict(XTest)
#measured performance
print(classification_report(yTest,pred_best))
              precision    recall  f1-score   support

        _yes       0.93      0.88      0.90       544
          no       0.93      0.95      0.94       837

    accuracy                           0.93      1381
   macro avg       0.93      0.92      0.92      1381
weighted avg       0.93      0.93      0.93      1381
#confusion matrix
print(confusion_matrix(yTest,pred_best))
[[481  63]
 [ 39 798]]
#gain in number of correctly classified instances
print((481+798)-(480+776))
23
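#equivalent check (sketch, not in the original session): the same gain can
#be obtained from the difference between the two test accuracies
from sklearn.metrics import accuracy_score
print(round((accuracy_score(yTest,pred_best) - accuracy_score(yTest,pred_first)) * len(yTest)))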