# Python version check.
# A bare `sys.version` expression only echoes its value in a REPL/notebook;
# wrap it in print() so the script also reports the version when run directly.
import sys
print(sys.version)
'3.11.6 | packaged by conda-forge | (main, Oct 3 2023, 10:29:11) [MSC v.1935 64 bit (AMD64)]'
# PyCaret version check.
# As above: a bare `pycaret.__version__` expression is a no-op outside a REPL,
# so print() it to make the script self-reporting.
import pycaret
print(pycaret.__version__)
'3.2.0'
# Set the default working directory (data file is loaded from here).
# NOTE(review): hard-coded absolute Windows path — fine for a local demo,
# must be adapted on another machine.
import os
os.chdir("C:/Users/ricco/Desktop/demo")
# Load the dataset from the Excel file in the working directory.
import pandas
donnees = pandas.read_excel("pima-subset-pycaret.xlsx")
# info() prints the column summary to stdout (per the transcript: 724 rows,
# 4 numeric predictors + the 'diabete' target column).
donnees.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 724 entries, 0 to 723 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 diastolic 724 non-null int64 1 bodymass 724 non-null float64 2 age 724 non-null int64 3 plasma 724 non-null int64 4 diabete 724 non-null object dtypes: float64(1), int64(3), object(1) memory usage: 28.4+ KB
# Class distribution of the target variable.
# A bare `value_counts()` expression only echoes in a REPL/notebook;
# print() it so the counts also appear when the script is run directly.
print(donnees.diabete.value_counts())
negative 475 positive 249 Name: diabete, dtype: int64
# Create a working session (experiment) using PyCaret's
# object-oriented ("OOP") API.
from pycaret.classification import ClassificationExperiment
# Instantiation and initialisation:
# - target="diabete": binary target (negative/positive)
# - train_size=0.7 with stratified train/test split
# - normalize=True: z-score standardisation of the numeric features
# - fold=5: 5-fold (stratified) cross-validation
# - session_id=2023: fixed seed for reproducibility
session = ClassificationExperiment()
session.setup(donnees,target="diabete",train_size=0.7,data_split_stratify=True,normalize=True,fold=5,session_id=2023)
# Sanity check — shows the experiment configuration summary.
print(session)
Description | Value | |
---|---|---|
0 | Session id | 2023 |
1 | Target | diabete |
2 | Target type | Binary |
3 | Target mapping | negative: 0, positive: 1 |
4 | Original data shape | (724, 5) |
5 | Transformed data shape | (724, 5) |
6 | Transformed train set shape | (506, 5) |
7 | Transformed test set shape | (218, 5) |
8 | Numeric features | 4 |
9 | Preprocess | True |
10 | Imputation type | simple |
11 | Numeric imputation | mean |
12 | Categorical imputation | mode |
13 | Normalize | True |
14 | Normalize method | zscore |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 5 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | a147 |
<pycaret.classification.oop.ClassificationExperiment object at 0x000001A161FE2C10>
# List the models available for classification
# (a.k.a. supervised classification) — returns a DataFrame
# of model IDs, names and sklearn references.
algos = session.models()
print(algos)
Name \ ID lr Logistic Regression knn K Neighbors Classifier nb Naive Bayes dt Decision Tree Classifier svm SVM - Linear Kernel rbfsvm SVM - Radial Kernel gpc Gaussian Process Classifier mlp MLP Classifier ridge Ridge Classifier rf Random Forest Classifier qda Quadratic Discriminant Analysis ada Ada Boost Classifier gbc Gradient Boosting Classifier lda Linear Discriminant Analysis et Extra Trees Classifier lightgbm Light Gradient Boosting Machine catboost CatBoost Classifier dummy Dummy Classifier Reference Turbo ID lr sklearn.linear_model._logistic.LogisticRegression True knn sklearn.neighbors._classification.KNeighborsCl... True nb sklearn.naive_bayes.GaussianNB True dt sklearn.tree._classes.DecisionTreeClassifier True svm sklearn.linear_model._stochastic_gradient.SGDC... True rbfsvm sklearn.svm._classes.SVC False gpc sklearn.gaussian_process._gpc.GaussianProcessC... False mlp sklearn.neural_network._multilayer_perceptron.... False ridge sklearn.linear_model._ridge.RidgeClassifier True rf sklearn.ensemble._forest.RandomForestClassifier True qda sklearn.discriminant_analysis.QuadraticDiscrim... True ada sklearn.ensemble._weight_boosting.AdaBoostClas... True gbc sklearn.ensemble._gb.GradientBoostingClassifier True lda sklearn.discriminant_analysis.LinearDiscrimina... True et sklearn.ensemble._forest.ExtraTreesClassifier True lightgbm lightgbm.sklearn.LGBMClassifier True catboost catboost.core.CatBoostClassifier True dummy sklearn.dummy.DummyClassifier True
# Fit a logistic regression ("lr") on the training set;
# create_model also displays the 5-fold cross-validated metrics.
modele = session.create_model("lr")
print(modele)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
Fold | |||||||
0 | 0.7549 | 0.8111 | 0.7549 | 0.7488 | 0.7499 | 0.4371 | 0.4399 |
1 | 0.7426 | 0.7682 | 0.7426 | 0.7344 | 0.7356 | 0.3974 | 0.4013 |
2 | 0.7129 | 0.7615 | 0.7129 | 0.7077 | 0.7096 | 0.3530 | 0.3538 |
3 | 0.7921 | 0.8398 | 0.7921 | 0.7920 | 0.7807 | 0.5043 | 0.5222 |
4 | 0.7822 | 0.8662 | 0.7822 | 0.7793 | 0.7717 | 0.4845 | 0.4983 |
Mean | 0.7569 | 0.8094 | 0.7569 | 0.7524 | 0.7495 | 0.4353 | 0.4431 |
Std | 0.0284 | 0.0404 | 0.0284 | 0.0304 | 0.0255 | 0.0555 | 0.0617 |
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=2023, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)
# Evaluation on the held-out test sample.
res = session.predict_model(modele)
# Display — we get the metrics summary plus a data frame of
# observed values vs. predictions (label and score).
print(res[['diabete','prediction_label','prediction_score']])
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Logistic Regression | 0.7752 | 0.8546 | 0.7752 | 0.7698 | 0.7672 | 0.4736 | 0.4817 |
diabete prediction_label prediction_score 330 negative negative 0.9112 190 negative negative 0.8536 699 positive negative 0.5047 17 negative negative 0.5992 660 positive positive 0.7929 .. ... ... ... 672 positive positive 0.6954 361 negative negative 0.8807 1 negative negative 0.9301 544 positive positive 0.9355 410 negative negative 0.6067 [218 rows x 3 columns]
# Display the variable-importance plot for the fitted model.
session.plot_model(modele,plot="feature")
# Final model: finalize_model refits on the full dataset.
modele_final = session.finalize_model(modele)
# It is actually a Pipeline with the default preprocessing steps.
# A bare `modele_final` expression only echoes in a REPL/notebook;
# print() it so the pipeline is also shown when run as a script.
print(modele_final)
Pipeline(memory=Memory(location=None), steps=[('label_encoding', TransformerWrapperWithInverse(exclude=None, include=None, transformer=LabelEncoder())), ('numerical_imputer', TransformerWrapper(exclude=None, include=['diastolic', 'bodymass', 'age', 'plasma'], transformer=SimpleImputer(add_indicator=False, copy=True, fill_value=None, keep_empty_features=False, mis... TransformerWrapper(exclude=None, include=None, transformer=StandardScaler(copy=True, with_mean=True, with_std=True))), ('actual_estimator', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=2023, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False))], verbose=False)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(memory=Memory(location=None), steps=[('label_encoding', TransformerWrapperWithInverse(exclude=None, include=None, transformer=LabelEncoder())), ('numerical_imputer', TransformerWrapper(exclude=None, include=['diastolic', 'bodymass', 'age', 'plasma'], transformer=SimpleImputer(add_indicator=False, copy=True, fill_value=None, keep_empty_features=False, mis... TransformerWrapper(exclude=None, include=None, transformer=StandardScaler(copy=True, with_mean=True, with_std=True))), ('actual_estimator', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=2023, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False))], verbose=False)
TransformerWrapperWithInverse(exclude=None, include=None, transformer=LabelEncoder())
LabelEncoder()
LabelEncoder()
TransformerWrapper(exclude=None, include=['diastolic', 'bodymass', 'age', 'plasma'], transformer=SimpleImputer(add_indicator=False, copy=True, fill_value=None, keep_empty_features=False, missing_values=nan, strategy='mean', verbose='deprecated'))
SimpleImputer()
SimpleImputer()
TransformerWrapper(exclude=None, include=[], transformer=SimpleImputer(add_indicator=False, copy=True, fill_value=None, keep_empty_features=False, missing_values=nan, strategy='most_frequent', verbose='deprecated'))
SimpleImputer(strategy='most_frequent')
SimpleImputer(strategy='most_frequent')
TransformerWrapper(exclude=None, include=None, transformer=StandardScaler(copy=True, with_mean=True, with_std=True))
StandardScaler()
StandardScaler()
LogisticRegression(max_iter=1000, random_state=2023)
# First create the pickle file — save_model persists the whole
# transformation pipeline + fitted model as "modele_ricco.pkl".
session.save_model(modele_final,"modele_ricco")
Transformation Pipeline and Model Successfully Saved
(Pipeline(memory=Memory(location=None), steps=[('label_encoding', TransformerWrapperWithInverse(exclude=None, include=None, transformer=LabelEncoder())), ('numerical_imputer', TransformerWrapper(exclude=None, include=['diastolic', 'bodymass', 'age', 'plasma'], transformer=SimpleImputer(add_indicator=False, copy=True, fill_value=None, keep_empty_features=False, mis... TransformerWrapper(exclude=None, include=None, transformer=StandardScaler(copy=True, with_mean=True, with_std=True))), ('actual_estimator', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=2023, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False))], verbose=False), 'modele_ricco.pkl')