Vérification des versions¶

In [ ]:
# Python interpreter version used for this notebook
import sys

sys.version
Out[ ]:
'3.11.6 | packaged by conda-forge | (main, Oct  3 2023, 10:29:11) [MSC v.1935 64 bit (AMD64)]'
In [ ]:
# pycaret version used for this notebook
import pycaret

pycaret.__version__
Out[ ]:
'3.2.0'

Importation des données¶

In [ ]:
# Working directory: keep the original default but allow an override
# through the DEMO_DIR environment variable, so the notebook is not
# tied to one machine's absolute path.
import os
os.chdir(os.environ.get("DEMO_DIR", "C:/Users/ricco/Desktop/demo"))

# Load the dataset (Pima subset) and show its structure.
import pandas
donnees = pandas.read_excel("pima-subset-pycaret.xlsx")
donnees.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 724 entries, 0 to 723
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   diastolic  724 non-null    int64  
 1   bodymass   724 non-null    float64
 2   age        724 non-null    int64  
 3   plasma     724 non-null    int64  
 4   diabete    724 non-null    object 
dtypes: float64(1), int64(3), object(1)
memory usage: 28.4+ KB
In [ ]:
# class distribution of the target variable
donnees["diabete"].value_counts()
Out[ ]:
negative    475
positive    249
Name: diabete, dtype: int64

Expérimentation - Développement du modèle¶

Initialisation + choix de l'algo + paramétrage de la session¶

In [ ]:
# Create a working session (experiment) using pycaret's
# object-oriented API.
from pycaret.classification import ClassificationExperiment

# Instantiate and initialize the experiment: 70/30 stratified
# train/test split, z-score normalization, 5-fold CV, fixed seed.
session = ClassificationExperiment()
session.setup(
    donnees,
    target="diabete",
    train_size=0.7,
    data_split_stratify=True,
    normalize=True,
    fold=5,
    session_id=2023,
)

# sanity check
print(session)
  Description Value
0 Session id 2023
1 Target diabete
2 Target type Binary
3 Target mapping negative: 0, positive: 1
4 Original data shape (724, 5)
5 Transformed data shape (724, 5)
6 Transformed train set shape (506, 5)
7 Transformed test set shape (218, 5)
8 Numeric features 4
9 Preprocess True
10 Imputation type simple
11 Numeric imputation mean
12 Categorical imputation mode
13 Normalize True
14 Normalize method zscore
15 Fold Generator StratifiedKFold
16 Fold Number 5
17 CPU Jobs -1
18 Use GPU False
19 Log Experiment False
20 Experiment Name clf-default-name
21 USI 1cd6
<pycaret.classification.oop.ClassificationExperiment object at 0x000001BEF717E350>
In [ ]:
# list the algorithms available for supervised classification
algos = session.models()
print(algos)
                                     Name  \
ID                                          
lr                    Logistic Regression   
knn                K Neighbors Classifier   
nb                            Naive Bayes   
dt               Decision Tree Classifier   
svm                   SVM - Linear Kernel   
rbfsvm                SVM - Radial Kernel   
gpc           Gaussian Process Classifier   
mlp                        MLP Classifier   
ridge                    Ridge Classifier   
rf               Random Forest Classifier   
qda       Quadratic Discriminant Analysis   
ada                  Ada Boost Classifier   
gbc          Gradient Boosting Classifier   
lda          Linear Discriminant Analysis   
et                 Extra Trees Classifier   
lightgbm  Light Gradient Boosting Machine   
catboost              CatBoost Classifier   
dummy                    Dummy Classifier   

                                                  Reference  Turbo  
ID                                                                  
lr        sklearn.linear_model._logistic.LogisticRegression   True  
knn       sklearn.neighbors._classification.KNeighborsCl...   True  
nb                           sklearn.naive_bayes.GaussianNB   True  
dt             sklearn.tree._classes.DecisionTreeClassifier   True  
svm       sklearn.linear_model._stochastic_gradient.SGDC...   True  
rbfsvm                             sklearn.svm._classes.SVC  False  
gpc       sklearn.gaussian_process._gpc.GaussianProcessC...  False  
mlp       sklearn.neural_network._multilayer_perceptron....  False  
ridge           sklearn.linear_model._ridge.RidgeClassifier   True  
rf          sklearn.ensemble._forest.RandomForestClassifier   True  
qda       sklearn.discriminant_analysis.QuadraticDiscrim...   True  
ada       sklearn.ensemble._weight_boosting.AdaBoostClas...   True  
gbc         sklearn.ensemble._gb.GradientBoostingClassifier   True  
lda       sklearn.discriminant_analysis.LinearDiscrimina...   True  
et            sklearn.ensemble._forest.ExtraTreesClassifier   True  
lightgbm                    lightgbm.sklearn.LGBMClassifier   True  
catboost                   catboost.core.CatBoostClassifier   True  
dummy                         sklearn.dummy.DummyClassifier   True  

Premier modèle avec les paramètres par défaut¶

In [ ]:
# fit a logistic regression on the training set (default parameters)
modele_first = session.create_model("lr")
print(modele_first)
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.7549 0.8111 0.7549 0.7488 0.7499 0.4371 0.4399
1 0.7426 0.7682 0.7426 0.7344 0.7356 0.3974 0.4013
2 0.7129 0.7615 0.7129 0.7077 0.7096 0.3530 0.3538
3 0.7921 0.8398 0.7921 0.7920 0.7807 0.5043 0.5222
4 0.7822 0.8662 0.7822 0.7793 0.7717 0.4845 0.4983
Mean 0.7569 0.8094 0.7569 0.7524 0.7495 0.4353 0.4431
Std 0.0284 0.0404 0.0284 0.0304 0.0255 0.0555 0.0617
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=2023, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
In [ ]:
# score the model on the held-out test sample
res = session.predict_model(modele_first)

# predict_model prints the summary metrics; below, the observed
# labels vs. the predictions with their scores
cols = ["diabete", "prediction_label", "prediction_score"]
print(res[cols])
  Model Accuracy AUC Recall Prec. F1 Kappa MCC
0 Logistic Regression 0.7752 0.8546 0.7752 0.7698 0.7672 0.4736 0.4817
      diabete prediction_label  prediction_score
330  negative         negative            0.9112
190  negative         negative            0.8536
699  positive         negative            0.5047
17   negative         negative            0.5992
660  positive         positive            0.7929
..        ...              ...               ...
672  positive         positive            0.6954
361  negative         negative            0.8807
1    negative         negative            0.9301
544  positive         positive            0.9355
410  negative         negative            0.6067

[218 rows x 3 columns]

Recherche du meilleur modèle¶

In [ ]:
# hyperparameter tuning of the logistic regression, maximizing accuracy
modele_best = session.tune_model(modele_first, optimize="Accuracy")
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.7549 0.8124 0.7549 0.7488 0.7499 0.4371 0.4399
1 0.7426 0.7678 0.7426 0.7344 0.7356 0.3974 0.4013
2 0.7327 0.7645 0.7327 0.7257 0.7273 0.3892 0.3918
3 0.8020 0.8398 0.8020 0.8057 0.7897 0.5245 0.5472
4 0.7822 0.8662 0.7822 0.7793 0.7717 0.4845 0.4983
Mean 0.7629 0.8101 0.7629 0.7588 0.7548 0.4465 0.4557
Std 0.0257 0.0398 0.0257 0.0297 0.0230 0.0516 0.0591
Fitting 5 folds for each of 10 candidates, totalling 50 fits
In [ ]:
# display the tuned model (rich repr of the fitted estimator)
modele_best
Out[ ]:
LogisticRegression(C=0.179, class_weight={}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=2023, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(C=0.179, class_weight={}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=2023, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
In [ ]:
# check performance on the test sample again, with the tuned model
res_best = session.predict_model(modele_best)
res_best
  Model Accuracy AUC Recall Prec. F1 Kappa MCC
0 Logistic Regression 0.7798 0.8542 0.7798 0.7750 0.7713 0.4826 0.4920
Out[ ]:
diastolic bodymass age plasma diabete prediction_label prediction_score
330 82 34.400002 46 61 negative negative 0.9007
190 68 27.299999 32 108 negative negative 0.8444
699 94 32.700001 45 140 positive negative 0.5031
17 88 39.299999 27 126 negative negative 0.5953
660 88 35.000000 52 168 positive positive 0.7775
... ... ... ... ... ... ... ...
672 78 33.799999 31 173 positive positive 0.6795
361 54 22.299999 24 119 negative negative 0.8736
1 66 26.600000 31 85 negative negative 0.9220
544 70 34.700001 62 197 positive positive 0.9217
410 75 29.900000 28 147 negative negative 0.6063

218 rows × 7 columns

In [ ]:
# plot the variable importances of the tuned model
session.plot_model(modele_best, plot="feature")
In [ ]:
# coefficients of the tuned logistic regression, one row per feature
temp_df = pandas.DataFrame({
    "var": modele_best.feature_names_in_,
    "coef": modele_best.coef_[0],
})
temp_df
Out[ ]:
var coef
0 diastolic -0.049967
1 bodymass 0.578019
2 age 0.338531
3 plasma 0.918690

Modèle définitif entraîné sur la totalité des données¶

In [ ]:
# final model, re-trained on the whole dataset
modele_final = session.finalize_model(modele_best)

# it is actually a full pipeline, including
# the default preprocessing steps
modele_final
Out[ ]:
Pipeline(memory=Memory(location=None),
         steps=[('label_encoding',
                 TransformerWrapperWithInverse(exclude=None, include=None,
                                               transformer=LabelEncoder())),
                ('numerical_imputer',
                 TransformerWrapper(exclude=None,
                                    include=['diastolic', 'bodymass', 'age',
                                             'plasma'],
                                    transformer=SimpleImputer(add_indicator=False,
                                                              copy=True,
                                                              fill_value=None,
                                                              keep_empty_features=False,
                                                              mis...
                 TransformerWrapper(exclude=None, include=None,
                                    transformer=StandardScaler(copy=True,
                                                               with_mean=True,
                                                               with_std=True))),
                ('actual_estimator',
                 LogisticRegression(C=0.179, class_weight={}, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=1000,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=2023,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(memory=Memory(location=None),
         steps=[('label_encoding',
                 TransformerWrapperWithInverse(exclude=None, include=None,
                                               transformer=LabelEncoder())),
                ('numerical_imputer',
                 TransformerWrapper(exclude=None,
                                    include=['diastolic', 'bodymass', 'age',
                                             'plasma'],
                                    transformer=SimpleImputer(add_indicator=False,
                                                              copy=True,
                                                              fill_value=None,
                                                              keep_empty_features=False,
                                                              mis...
                 TransformerWrapper(exclude=None, include=None,
                                    transformer=StandardScaler(copy=True,
                                                               with_mean=True,
                                                               with_std=True))),
                ('actual_estimator',
                 LogisticRegression(C=0.179, class_weight={}, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=1000,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=2023,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)
TransformerWrapperWithInverse(exclude=None, include=None,
                              transformer=LabelEncoder())
LabelEncoder()
LabelEncoder()
TransformerWrapper(exclude=None,
                   include=['diastolic', 'bodymass', 'age', 'plasma'],
                   transformer=SimpleImputer(add_indicator=False, copy=True,
                                             fill_value=None,
                                             keep_empty_features=False,
                                             missing_values=nan,
                                             strategy='mean',
                                             verbose='deprecated'))
SimpleImputer()
SimpleImputer()
TransformerWrapper(exclude=None, include=[],
                   transformer=SimpleImputer(add_indicator=False, copy=True,
                                             fill_value=None,
                                             keep_empty_features=False,
                                             missing_values=nan,
                                             strategy='most_frequent',
                                             verbose='deprecated'))
SimpleImputer(strategy='most_frequent')
SimpleImputer(strategy='most_frequent')
TransformerWrapper(exclude=None, include=None,
                   transformer=StandardScaler(copy=True, with_mean=True,
                                              with_std=True))
StandardScaler()
StandardScaler()
LogisticRegression(C=0.179, class_weight={}, max_iter=1000, random_state=2023)

Déploiement¶

In [ ]:
# generate an API script for deploying the final model
mon_api = session.create_api(modele_final, "api_ricco")
API successfully created. This function only creates a POST API, it doesn't run it automatically. To run your API, please run this command --> !python api_ricco.py