Vérification des versions¶

In [ ]:
# Which Python interpreter is this notebook running on?
import sys
py_version = sys.version
py_version  # last expression -> displayed as the cell's output
Out[ ]:
'3.11.6 | packaged by conda-forge | (main, Oct  3 2023, 10:29:11) [MSC v.1935 64 bit (AMD64)]'
In [ ]:
# Which PyCaret version is installed?
import pycaret
pycaret_version = pycaret.__version__
pycaret_version  # last expression -> displayed as the cell's output
Out[ ]:
'3.2.0'

Importation des données¶

In [ ]:
# Set the working directory.
# NOTE(review): hardcoded absolute local path — this only works on the
# author's machine; consider a configurable data directory instead.
import os
os.chdir("C:/Users/ricco/Desktop/demo")

# Load the dataset (5 columns: 4 numeric predictors + the "diabete" target,
# 724 rows, no missing values — see the .info() output below)
import pandas
donnees = pandas.read_excel("pima-subset-pycaret.xlsx")
donnees.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 724 entries, 0 to 723
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   diastolic  724 non-null    int64  
 1   bodymass   724 non-null    float64
 2   age        724 non-null    int64  
 3   plasma     724 non-null    int64  
 4   diabete    724 non-null    object 
dtypes: float64(1), int64(3), object(1)
memory usage: 28.4+ KB
In [ ]:
# Class distribution of the target variable (negative vs. positive)
donnees["diabete"].value_counts()
Out[ ]:
negative    475
positive    249
Name: diabete, dtype: int64

Expérimentation - Développement du modèle¶

Initialisation + choix de l'algo + paramétrage de la session¶

In [ ]:
# Create a working session (experiment) using PyCaret's object-oriented API
from pycaret.classification import ClassificationExperiment

# Instantiation and initialization:
#  - 70/30 train/test split, stratified on the target
#  - z-score normalization of the numeric features
#  - 5-fold (stratified) cross-validation
#  - fixed session_id for reproducibility
session = ClassificationExperiment()
session.setup(donnees,target="diabete",train_size=0.7,data_split_stratify=True,normalize=True,fold=5,session_id=2023)

# Sanity check: print the experiment object
print(session)
  Description Value
0 Session id 2023
1 Target diabete
2 Target type Binary
3 Target mapping negative: 0, positive: 1
4 Original data shape (724, 5)
5 Transformed data shape (724, 5)
6 Transformed train set shape (506, 5)
7 Transformed test set shape (218, 5)
8 Numeric features 4
9 Preprocess True
10 Imputation type simple
11 Numeric imputation mean
12 Categorical imputation mode
13 Normalize True
14 Normalize method zscore
15 Fold Generator StratifiedKFold
16 Fold Number 5
17 CPU Jobs -1
18 Use GPU False
19 Log Experiment False
20 Experiment Name clf-default-name
21 USI a147
<pycaret.classification.oop.ClassificationExperiment object at 0x000001A161FE2C10>
In [ ]:
# List the models available for classification
# (aka supervised classification)
algos = session.models()
print(algos)
                                     Name  \
ID                                          
lr                    Logistic Regression   
knn                K Neighbors Classifier   
nb                            Naive Bayes   
dt               Decision Tree Classifier   
svm                   SVM - Linear Kernel   
rbfsvm                SVM - Radial Kernel   
gpc           Gaussian Process Classifier   
mlp                        MLP Classifier   
ridge                    Ridge Classifier   
rf               Random Forest Classifier   
qda       Quadratic Discriminant Analysis   
ada                  Ada Boost Classifier   
gbc          Gradient Boosting Classifier   
lda          Linear Discriminant Analysis   
et                 Extra Trees Classifier   
lightgbm  Light Gradient Boosting Machine   
catboost              CatBoost Classifier   
dummy                    Dummy Classifier   

                                                  Reference  Turbo  
ID                                                                  
lr        sklearn.linear_model._logistic.LogisticRegression   True  
knn       sklearn.neighbors._classification.KNeighborsCl...   True  
nb                           sklearn.naive_bayes.GaussianNB   True  
dt             sklearn.tree._classes.DecisionTreeClassifier   True  
svm       sklearn.linear_model._stochastic_gradient.SGDC...   True  
rbfsvm                             sklearn.svm._classes.SVC  False  
gpc       sklearn.gaussian_process._gpc.GaussianProcessC...  False  
mlp       sklearn.neural_network._multilayer_perceptron....  False  
ridge           sklearn.linear_model._ridge.RidgeClassifier   True  
rf          sklearn.ensemble._forest.RandomForestClassifier   True  
qda       sklearn.discriminant_analysis.QuadraticDiscrim...   True  
ada       sklearn.ensemble._weight_boosting.AdaBoostClas...   True  
gbc         sklearn.ensemble._gb.GradientBoostingClassifier   True  
lda       sklearn.discriminant_analysis.LinearDiscrimina...   True  
et            sklearn.ensemble._forest.ExtraTreesClassifier   True  
lightgbm                    lightgbm.sklearn.LGBMClassifier   True  
catboost                   catboost.core.CatBoostClassifier   True  
dummy                         sklearn.dummy.DummyClassifier   True  

Premier modèle avec les paramètres par défaut¶

In [ ]:
# Fit a logistic regression on the training set
# (cross-validated fold metrics are displayed by create_model)
modele = session.create_model("lr")
print(modele)
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.7549 0.8111 0.7549 0.7488 0.7499 0.4371 0.4399
1 0.7426 0.7682 0.7426 0.7344 0.7356 0.3974 0.4013
2 0.7129 0.7615 0.7129 0.7077 0.7096 0.3530 0.3538
3 0.7921 0.8398 0.7921 0.7920 0.7807 0.5043 0.5222
4 0.7822 0.8662 0.7822 0.7793 0.7717 0.4845 0.4983
Mean 0.7569 0.8094 0.7569 0.7524 0.7495 0.4353 0.4431
Std 0.0284 0.0404 0.0284 0.0304 0.0255 0.0555 0.0617
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=2023, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
In [ ]:
# Evaluate the model on the held-out test sample
res = session.predict_model(modele)

# Display: the metric summary is printed by predict_model; `res` is a
# data frame of observed values vs. predictions (label + score)
print(res[['diabete','prediction_label','prediction_score']])
  Model Accuracy AUC Recall Prec. F1 Kappa MCC
0 Logistic Regression 0.7752 0.8546 0.7752 0.7698 0.7672 0.4736 0.4817
      diabete prediction_label  prediction_score
330  negative         negative            0.9112
190  negative         negative            0.8536
699  positive         negative            0.5047
17   negative         negative            0.5992
660  positive         positive            0.7929
..        ...              ...               ...
672  positive         positive            0.6954
361  negative         negative            0.8807
1    negative         negative            0.9301
544  positive         positive            0.9355
410  negative         negative            0.6067

[218 rows x 3 columns]
In [ ]:
# Display the variable-importance plot for the fitted model
session.plot_model(modele,plot="feature")

Modèle définitif entraîné sur la totalité des données¶

In [ ]:
# Final model — per PyCaret's finalize_model semantics this refits on the
# complete dataset (train + test); TODO confirm against the PyCaret docs
modele_final = session.finalize_model(modele)

# It is actually a scikit-learn Pipeline,
# with the session's default preprocessing steps included
modele_final
Out[ ]:
Pipeline(memory=Memory(location=None),
         steps=[('label_encoding',
                 TransformerWrapperWithInverse(exclude=None, include=None,
                                               transformer=LabelEncoder())),
                ('numerical_imputer',
                 TransformerWrapper(exclude=None,
                                    include=['diastolic', 'bodymass', 'age',
                                             'plasma'],
                                    transformer=SimpleImputer(add_indicator=False,
                                                              copy=True,
                                                              fill_value=None,
                                                              keep_empty_features=False,
                                                              mis...
                 TransformerWrapper(exclude=None, include=None,
                                    transformer=StandardScaler(copy=True,
                                                               with_mean=True,
                                                               with_std=True))),
                ('actual_estimator',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=1000,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=2023,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(memory=Memory(location=None),
         steps=[('label_encoding',
                 TransformerWrapperWithInverse(exclude=None, include=None,
                                               transformer=LabelEncoder())),
                ('numerical_imputer',
                 TransformerWrapper(exclude=None,
                                    include=['diastolic', 'bodymass', 'age',
                                             'plasma'],
                                    transformer=SimpleImputer(add_indicator=False,
                                                              copy=True,
                                                              fill_value=None,
                                                              keep_empty_features=False,
                                                              mis...
                 TransformerWrapper(exclude=None, include=None,
                                    transformer=StandardScaler(copy=True,
                                                               with_mean=True,
                                                               with_std=True))),
                ('actual_estimator',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=1000,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=2023,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)
TransformerWrapperWithInverse(exclude=None, include=None,
                              transformer=LabelEncoder())
LabelEncoder()
LabelEncoder()
TransformerWrapper(exclude=None,
                   include=['diastolic', 'bodymass', 'age', 'plasma'],
                   transformer=SimpleImputer(add_indicator=False, copy=True,
                                             fill_value=None,
                                             keep_empty_features=False,
                                             missing_values=nan,
                                             strategy='mean',
                                             verbose='deprecated'))
SimpleImputer()
SimpleImputer()
TransformerWrapper(exclude=None, include=[],
                   transformer=SimpleImputer(add_indicator=False, copy=True,
                                             fill_value=None,
                                             keep_empty_features=False,
                                             missing_values=nan,
                                             strategy='most_frequent',
                                             verbose='deprecated'))
SimpleImputer(strategy='most_frequent')
SimpleImputer(strategy='most_frequent')
TransformerWrapper(exclude=None, include=None,
                   transformer=StandardScaler(copy=True, with_mean=True,
                                              with_std=True))
StandardScaler()
StandardScaler()
LogisticRegression(max_iter=1000, random_state=2023)

Sauvegarde pour le déploiement¶

In [ ]:
# First create the pickle file for deployment
# (writes "modele_ricco.pkl" — the preprocessing pipeline + model)
session.save_model(modele_final,"modele_ricco")
Transformation Pipeline and Model Successfully Saved
Out[ ]:
(Pipeline(memory=Memory(location=None),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['diastolic', 'bodymass', 'age',
                                              'plasma'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               mis...
                  TransformerWrapper(exclude=None, include=None,
                                     transformer=StandardScaler(copy=True,
                                                                with_mean=True,
                                                                with_std=True))),
                 ('actual_estimator',
                  LogisticRegression(C=1.0, class_weight=None, dual=False,
                                     fit_intercept=True, intercept_scaling=1,
                                     l1_ratio=None, max_iter=1000,
                                     multi_class='auto', n_jobs=None,
                                     penalty='l2', random_state=2023,
                                     solver='lbfgs', tol=0.0001, verbose=0,
                                     warm_start=False))],
          verbose=False),
 'modele_ricco.pkl')