In [1]:
#change the working directory
import os
os.chdir("C:/Users/ricco/Desktop/demo")

#read the data file
import pandas
D = pandas.read_excel("spam.xlsx")

#check the data structure
D.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 56 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   wf_make                     4601 non-null   float64
 1   wf_address                  4601 non-null   float64
 2   wf_all                      4601 non-null   float64
 3   wf_3d                       4601 non-null   float64
 4   wf_our                      4601 non-null   float64
 5   wf_over                     4601 non-null   float64
 6   wf_remove                   4601 non-null   float64
 7   wf_internet                 4601 non-null   float64
 8   wf_order                    4601 non-null   float64
 9   wf_mail                     4601 non-null   float64
 10  wf_receive                  4601 non-null   float64
 11  wf_will                     4601 non-null   float64
 12  wf_people                   4601 non-null   float64
 13  wf_report                   4601 non-null   float64
 14  wf_addresses                4601 non-null   float64
 15  wf_free                     4601 non-null   float64
 16  wf_business                 4601 non-null   float64
 17  wf_email                    4601 non-null   float64
 18  wf_you                      4601 non-null   float64
 19  wf_credit                   4601 non-null   float64
 20  wf_your                     4601 non-null   float64
 21  wf_font                     4601 non-null   float64
 22  wf_000                      4601 non-null   float64
 23  wf_money                    4601 non-null   float64
 24  wf_hp                       4601 non-null   float64
 25  wf_hpl                      4601 non-null   float64
 26  wf_lab                      4601 non-null   float64
 27  wf_labs                     4601 non-null   float64
 28  wf_telnet                   4601 non-null   float64
 29  wf_857                      4601 non-null   float64
 30  wf_data                     4601 non-null   float64
 31  wf_415                      4601 non-null   float64
 32  wf_85                       4601 non-null   float64
 33  wf_technology               4601 non-null   float64
 34  wf_1999                     4601 non-null   float64
 35  wf_parts                    4601 non-null   float64
 36  wf_pm                       4601 non-null   float64
 37  wf_direct                   4601 non-null   float64
 38  wf_cs                       4601 non-null   float64
 39  wf_meeting                  4601 non-null   float64
 40  wf_original                 4601 non-null   float64
 41  wf_project                  4601 non-null   float64
 42  wf_re                       4601 non-null   float64
 43  wf_edu                      4601 non-null   float64
 44  wf_table                    4601 non-null   float64
 45  wf_conference               4601 non-null   float64
 46  cf_comma                    4601 non-null   float64
 47  cf_bracket                  4601 non-null   float64
 48  cf_sq_bracket               4601 non-null   float64
 49  cf_exclam_point             4601 non-null   float64
 50  cf_dollar                   4601 non-null   float64
 51  cf_sharp                    4601 non-null   float64
 52  capital_run_length_average  4601 non-null   float64
 53  capital_run_length_longest  4601 non-null   int64  
 54  capital_run_length_total    4601 non-null   int64  
 55  spam                        4601 non-null   object 
dtypes: float64(53), int64(2), object(1)
memory usage: 2.0+ MB
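Before modeling, a quick sanity check (not part of the original session) can confirm that no values are missing and that the predictors look reasonable; both calls below are standard pandas:

#sketch: sanity checks on the loaded frame
print(D.isna().sum().sum())   #total number of missing values, expected to be 0
print(D.describe().T.head())  #summary stats per column, transposed for readability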
In [2]:
#class distribution
D.spam.value_counts()
Out[2]:
no      2788
_yes    1813
Name: spam, dtype: int64
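Raw counts can be turned into proportions with normalize=True; a minimal sketch on the same column:

#relative class frequencies: roughly 0.606 no vs 0.394 _yes
print(D.spam.value_counts(normalize=True))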
In [3]:
#** prepare the data structures **

#matrix of predictor variables
X = D[D.columns[:-1]]

#target vector
y = D.spam
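Selecting all columns but the last works here only because the target sits in the final column; dropping it by name is an equivalent, more explicit alternative:

#same predictor matrix, built by excluding the target column by name
X_alt = D.drop(columns='spam')
print(X_alt.equals(X))  #True: identical to the positional selection above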
In [4]:
#train-test split
from sklearn.model_selection import train_test_split
XTrain, XTest, yTrain, yTest = train_test_split(X,y,train_size=0.7,stratify=y,random_state=0)

#train dimensions
print(XTrain.shape)

#test dimensions
print(XTest.shape)
(3220, 55)
(1381, 55)
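Since stratify=y was passed, the class proportions in both partitions should closely match the full sample; a quick check:

#verify stratification: train and test proportions should be nearly identical
print(yTrain.value_counts(normalize=True))
print(yTest.value_counts(normalize=True))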
In [5]:
#modeling with logistic regression
#Ridge penalty, solver = liblinear, C = 0.01
from sklearn.linear_model import LogisticRegression
reg_first = LogisticRegression(solver="liblinear",penalty='l2',C=0.01)
reg_first.fit(XTrain,yTrain)

#shape of the coefficient matrix (one row of 55 coefficients)
print(reg_first.coef_.shape)
(1, 55)
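The shape (1, 55) means one coefficient per predictor. To see which terms carry the most weight, the coefficients can be indexed by the column names; a minimal sketch (positive values push toward reg_first.classes_[1]):

#coefficients paired with variable names, largest magnitudes first
coefs = pandas.Series(reg_first.coef_[0], index=XTrain.columns)
print(coefs.abs().sort_values(ascending=False).head())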
In [6]:
#prediction on the test set
pred_first = reg_first.predict(XTest)

#performance on the test set
from sklearn.metrics import classification_report
print(classification_report(yTest,pred_first))
              precision    recall  f1-score   support

        _yes       0.89      0.88      0.88       544
          no       0.92      0.93      0.93       837

    accuracy                           0.91      1381
   macro avg       0.91      0.90      0.91      1381
weighted avg       0.91      0.91      0.91      1381

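When only the global error rate matters, accuracy_score returns it directly; a sketch on the same predictions:

#overall test accuracy, about 0.91 for this first model
from sklearn.metrics import accuracy_score
print(accuracy_score(yTest, pred_first))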
In [7]:
#confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(yTest,pred_first))
[[480  64]
 [ 61 776]]
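The raw array does not show which axis holds the true labels (rows follow the sorted label order, _yes then no); pandas.crosstab gives a labeled version:

#labeled confusion matrix, rows = actual class, columns = predicted class
print(pandas.crosstab(yTest, pred_first, rownames=['actual'], colnames=['predicted']))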
In [8]:
#hyperparameters to tune
#penalty: Ridge (l2) or Lasso (l1)
#regularization strength: C
parametres = [{'penalty':['l1','l2'],'C':[0.01,0.1,1.0,10.0,100.0]}]

#search grid
from sklearn.model_selection import GridSearchCV
grille = GridSearchCV(estimator=reg_first,param_grid=parametres,scoring='accuracy',cv=10)
grille.fit(XTrain,yTrain)
Out[8]:
GridSearchCV(cv=10, estimator=LogisticRegression(C=0.01, solver='liblinear'),
             param_grid=[{'C': [0.01, 0.1, 1.0, 10.0, 100.0],
                          'penalty': ['l1', 'l2']}],
             scoring='accuracy')
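With 10 candidate settings and cv=10, the grid fits 100 models, then (because refit=True by default) refits the best configuration on the full training sample; a quick check of the number of candidates:

#number of parameter combinations evaluated (2 penalties x 5 values of C)
print(len(grille.cv_results_['params']))  #10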
In [9]:
#display the results
print(pandas.DataFrame(grille.cv_results_).loc[:,['params','mean_test_score']])
                          params  mean_test_score
0   {'C': 0.01, 'penalty': 'l1'}         0.822981
1   {'C': 0.01, 'penalty': 'l2'}         0.896584
2    {'C': 0.1, 'penalty': 'l1'}         0.900932
3    {'C': 0.1, 'penalty': 'l2'}         0.909006
4    {'C': 1.0, 'penalty': 'l1'}         0.910870
5    {'C': 1.0, 'penalty': 'l2'}         0.909938
6   {'C': 10.0, 'penalty': 'l1'}         0.911491
7   {'C': 10.0, 'penalty': 'l2'}         0.911180
8  {'C': 100.0, 'penalty': 'l1'}         0.909938
9  {'C': 100.0, 'penalty': 'l2'}         0.911180
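The same results read more easily as a C-by-penalty table; a sketch pivoting cv_results_ (param_C and param_penalty are standard keys of that dictionary):

#cross-validated accuracy, one row per C, one column per penalty
res = pandas.DataFrame(grille.cv_results_)
print(res.pivot(index='param_C', columns='param_penalty', values='mean_test_score'))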
In [10]:
#best parameter combination
print(grille.best_params_)
{'C': 10.0, 'penalty': 'l1'}
In [11]:
#best cross-validated score
print(grille.best_score_)
0.9114906832298135
In [12]:
#prediction with the best model
#grille.predict() would also have worked directly - see the documentation
pred_best = grille.best_estimator_.predict(XTest)

#measured performance on the test set
print(classification_report(yTest,pred_best))
              precision    recall  f1-score   support

        _yes       0.93      0.88      0.90       544
          no       0.93      0.95      0.94       837

    accuracy                           0.93      1381
   macro avg       0.93      0.92      0.92      1381
weighted avg       0.93      0.93      0.93      1381

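As the comment above notes, grille.predict() delegates to the refitted best estimator, so it returns exactly the same predictions; a quick check:

#grille.predict uses best_estimator_ under the hood when refit=True
print((grille.predict(XTest) == pred_best).all())  #True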
In [13]:
#confusion matrix
print(confusion_matrix(yTest,pred_best))
[[481  63]
 [ 39 798]]
In [14]:
#gain in the number of correctly classified instances
print((481+798)-(480+776))
23
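The same figure can be computed from the confusion matrices directly, since the correctly classified counts lie on the diagonal; a sketch assuming numpy is available:

#gain = trace of the tuned matrix minus trace of the first one
import numpy
gain = numpy.trace(confusion_matrix(yTest, pred_best)) - numpy.trace(confusion_matrix(yTest, pred_first))
print(gain)  #23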