# Python version check (output below: 3.11.6, conda-forge build)
import sys
sys.version
'3.11.6 | packaged by conda-forge | (main, Oct 3 2023, 10:29:11) [MSC v.1935 64 bit (AMD64)]'
# PyCaret version check (output below: 3.2.0)
import pycaret
print(pycaret.__version__)
3.2.0
# MLflow version check (output below: 1.30.1)
import mlflow
mlflow.__version__
'1.30.1'
# change the working directory to the demo folder
import os
os.chdir("C:/Users/ricco/Desktop/demo")
# load the labeled dataset (first worksheet of the Excel file);
# per the info() output below: 2310 rows, 19 numeric features + 'classe' target
import pandas
DLabeled = pandas.read_excel("image.xlsx",sheet_name=0)
DLabeled.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2310 entries, 0 to 2309 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 classe 2310 non-null object 1 REGION_CENTROID_COL 2310 non-null int64 2 REGION_CENTROID_ROW 2310 non-null int64 3 REGION_PIXEL_COUNT 2310 non-null int64 4 SHORT_LINE_DENSITY_5 2310 non-null float64 5 SHORT_LINE_DENSITY_2 2310 non-null float64 6 VEDGE_MEAN 2310 non-null float64 7 VEDGE_SD 2310 non-null float64 8 HEDGE_MEAN 2310 non-null float64 9 HEDGE_SD 2310 non-null float64 10 INTENSITY_MEAN 2310 non-null float64 11 RAWRED_MEAN 2310 non-null float64 12 RAWBLUE_MEAN 2310 non-null float64 13 RAWGREEN_MEAN 2310 non-null float64 14 EXRED_MEAN 2310 non-null float64 15 EXBLUE_MEAN 2310 non-null float64 16 EXGREEN_MEAN 2310 non-null float64 17 VALUE_MEAN 2310 non-null float64 18 SATURATION_MEAN 2310 non-null float64 19 HUE_MEAN 2310 non-null float64 dtypes: float64(16), int64(3), object(1) memory usage: 361.1+ KB
# class distribution — balanced: 330 instances in each of the 7 classes (see output below)
print(DLabeled.classe.value_counts())
BRICKFACE 330 SKY 330 FOLIAGE 330 CEMENT 330 WINDOW 330 PATH 330 GRASS 330 Name: classe, dtype: int64
# import PyCaret's experimentation tool
from pycaret.classification import ClassificationExperiment
# create a working session: 70/30 stratified train/test split, z-score
# normalization, 5-fold stratified CV, fixed seed (session_id=0),
# results logged to MLflow under experiment "recherche_modele"
session_prim = ClassificationExperiment()
session_prim.setup(data=DLabeled,normalize=True,target='classe',
train_size=0.7,data_split_stratify=True,fold=5,session_id=0,
log_experiment=True,experiment_name="recherche_modele")
Description | Value | |
---|---|---|
0 | Session id | 0 |
1 | Target | classe |
2 | Target type | Multiclass |
3 | Target mapping | BRICKFACE: 0, CEMENT: 1, FOLIAGE: 2, GRASS: 3, PATH: 4, SKY: 5, WINDOW: 6 |
4 | Original data shape | (2310, 20) |
5 | Transformed data shape | (2310, 20) |
6 | Transformed train set shape | (1617, 20) |
7 | Transformed test set shape | (693, 20) |
8 | Numeric features | 19 |
9 | Preprocess | True |
10 | Imputation type | simple |
11 | Numeric imputation | mean |
12 | Categorical imputation | mode |
13 | Normalize | True |
14 | Normalize method | zscore |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 5 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | MlflowLogger |
20 | Experiment Name | recherche_modele |
21 | USI | 379f |
<pycaret.classification.oop.ClassificationExperiment at 0x1b6745267d0>
# list the algorithms available in the created environment
algos = session_prim.models()
print(algos)
Name \ ID lr Logistic Regression knn K Neighbors Classifier nb Naive Bayes dt Decision Tree Classifier svm SVM - Linear Kernel rbfsvm SVM - Radial Kernel gpc Gaussian Process Classifier mlp MLP Classifier ridge Ridge Classifier rf Random Forest Classifier qda Quadratic Discriminant Analysis ada Ada Boost Classifier gbc Gradient Boosting Classifier lda Linear Discriminant Analysis et Extra Trees Classifier lightgbm Light Gradient Boosting Machine catboost CatBoost Classifier dummy Dummy Classifier Reference Turbo ID lr sklearn.linear_model._logistic.LogisticRegression True knn sklearn.neighbors._classification.KNeighborsCl... True nb sklearn.naive_bayes.GaussianNB True dt sklearn.tree._classes.DecisionTreeClassifier True svm sklearn.linear_model._stochastic_gradient.SGDC... True rbfsvm sklearn.svm._classes.SVC False gpc sklearn.gaussian_process._gpc.GaussianProcessC... False mlp sklearn.neural_network._multilayer_perceptron.... False ridge sklearn.linear_model._ridge.RidgeClassifier True rf sklearn.ensemble._forest.RandomForestClassifier True qda sklearn.discriminant_analysis.QuadraticDiscrim... True ada sklearn.ensemble._weight_boosting.AdaBoostClas... True gbc sklearn.ensemble._gb.GradientBoostingClassifier True lda sklearn.discriminant_analysis.LinearDiscrimina... True et sklearn.ensemble._forest.ExtraTreesClassifier True lightgbm lightgbm.sklearn.LGBMClassifier True catboost catboost.core.CatBoostClassifier True dummy sklearn.dummy.DummyClassifier True
# compare a restricted set of candidate models by 5-fold CV, ranked on Accuracy;
# returns the best model (decision tree per the table below)
top_models = session_prim.compare_models(sort='Accuracy',include=['lr','nb','dt','svm','lda'])
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
dt | Decision Tree Classifier | 0.9511 | 0.9715 | 0.9511 | 0.9524 | 0.9512 | 0.9430 | 0.9432 | 0.0180 |
lr | Logistic Regression | 0.9301 | 0.9925 | 0.9301 | 0.9312 | 0.9303 | 0.9185 | 0.9186 | 0.9700 |
svm | SVM - Linear Kernel | 0.9196 | 0.0000 | 0.9196 | 0.9232 | 0.9196 | 0.9062 | 0.9070 | 0.0260 |
lda | Linear Discriminant Analysis | 0.9140 | 0.9883 | 0.9140 | 0.9165 | 0.9130 | 0.8997 | 0.9005 | 0.0180 |
nb | Naive Bayes | 0.7885 | 0.9677 | 0.7885 | 0.8057 | 0.7726 | 0.7533 | 0.7612 | 0.7480 |
# retrieve the detailed comparison results as a DataFrame
session_prim.pull()
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
dt | Decision Tree Classifier | 0.9511 | 0.9715 | 0.9511 | 0.9524 | 0.9512 | 0.9430 | 0.9432 | 0.018 |
lr | Logistic Regression | 0.9301 | 0.9925 | 0.9301 | 0.9312 | 0.9303 | 0.9185 | 0.9186 | 0.970 |
svm | SVM - Linear Kernel | 0.9196 | 0.0000 | 0.9196 | 0.9232 | 0.9196 | 0.9062 | 0.9070 | 0.026 |
lda | Linear Discriminant Analysis | 0.9140 | 0.9883 | 0.9140 | 0.9165 | 0.9130 | 0.8997 | 0.9005 | 0.018 |
nb | Naive Bayes | 0.7885 | 0.9677 | 0.7885 | 0.8057 | 0.7726 | 0.7533 | 0.7612 | 0.748 |
# second session with identical preprocessing (same seed, split and folds),
# logged to MLflow under a separate experiment name "optimisation_modele"
session_bis = ClassificationExperiment()
session_bis.setup(data=DLabeled,normalize=True,target='classe',
train_size=0.7,data_split_stratify=True,fold=5,session_id=0,
log_experiment=True,experiment_name="optimisation_modele")
Description | Value | |
---|---|---|
0 | Session id | 0 |
1 | Target | classe |
2 | Target type | Multiclass |
3 | Target mapping | BRICKFACE: 0, CEMENT: 1, FOLIAGE: 2, GRASS: 3, PATH: 4, SKY: 5, WINDOW: 6 |
4 | Original data shape | (2310, 20) |
5 | Transformed data shape | (2310, 20) |
6 | Transformed train set shape | (1617, 20) |
7 | Transformed test set shape | (693, 20) |
8 | Numeric features | 19 |
9 | Preprocess | True |
10 | Imputation type | simple |
11 | Numeric imputation | mean |
12 | Categorical imputation | mode |
13 | Normalize | True |
14 | Normalize method | zscore |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 5 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | MlflowLogger |
20 | Experiment Name | optimisation_modele |
21 | USI | 87c2 |
<pycaret.classification.oop.ClassificationExperiment at 0x1b678f9da10>
# instantiate the decision tree with an initial (deliberately constrained)
# parameterization: min_samples_split=50, max_depth=5
mybest = session_bis.create_model("dt",min_samples_split=50,max_depth=5)
print(mybest)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
Fold | |||||||
0 | 0.8457 | 0.9645 | 0.8457 | 0.9007 | 0.8310 | 0.8200 | 0.8323 |
1 | 0.8302 | 0.9645 | 0.8302 | 0.7529 | 0.7785 | 0.8020 | 0.8201 |
2 | 0.8235 | 0.9591 | 0.8235 | 0.7520 | 0.7755 | 0.7941 | 0.8134 |
3 | 0.7957 | 0.9521 | 0.7957 | 0.7408 | 0.7533 | 0.7616 | 0.7825 |
4 | 0.8173 | 0.9633 | 0.8173 | 0.7596 | 0.7737 | 0.7869 | 0.8082 |
Mean | 0.8225 | 0.9607 | 0.8225 | 0.7812 | 0.7824 | 0.7929 | 0.8113 |
Std | 0.0164 | 0.0047 | 0.0164 | 0.0601 | 0.0259 | 0.0191 | 0.0165 |
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=5, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=50, min_weight_fraction_leaf=0.0, random_state=0, splitter='best')
# tune the tree's hyperparameters via an exhaustive grid search (3x3 grid),
# keeping the better of tuned vs. original (choose_better=True);
# return_tuner=True also returns the fitted sklearn tuner object
tuned_mybest, essais = session_bis.tune_model(mybest,optimize="Accuracy",choose_better=True,
custom_grid={'min_samples_split':[2,10,20],'max_depth':[5,10,None]},search_algorithm='grid',
return_tuner = True)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
Fold | |||||||
0 | 0.9599 | 0.9766 | 0.9599 | 0.9598 | 0.9596 | 0.9532 | 0.9533 |
1 | 0.9506 | 0.9712 | 0.9506 | 0.9520 | 0.9502 | 0.9424 | 0.9427 |
2 | 0.9412 | 0.9657 | 0.9412 | 0.9431 | 0.9417 | 0.9314 | 0.9315 |
3 | 0.9474 | 0.9693 | 0.9474 | 0.9497 | 0.9478 | 0.9386 | 0.9388 |
4 | 0.9567 | 0.9747 | 0.9567 | 0.9574 | 0.9568 | 0.9494 | 0.9495 |
Mean | 0.9511 | 0.9715 | 0.9511 | 0.9524 | 0.9512 | 0.9430 | 0.9432 |
Std | 0.0066 | 0.0039 | 0.0066 | 0.0059 | 0.0064 | 0.0077 | 0.0077 |
Fitting 5 folds for each of 9 candidates, totalling 45 fits
# display the grid-search trials: parameter pairs and mean CV score,
# sorted from best to worst (from the tuner's cv_results_ attribute)
pandas.DataFrame.from_dict(essais.cv_results_)[
['param_actual_estimator__max_depth','param_actual_estimator__min_samples_split',
'mean_test_score']].sort_values(by='mean_test_score',ascending=False)
param_actual_estimator__max_depth | param_actual_estimator__min_samples_split | mean_test_score | |
---|---|---|---|
6 | None | 2 | 0.951139 |
3 | 10 | 2 | 0.951135 |
7 | None | 10 | 0.949908 |
4 | 10 | 10 | 0.948045 |
5 | 10 | 20 | 0.944953 |
8 | None | 20 | 0.941243 |
0 | 5 | 2 | 0.823109 |
1 | 5 | 10 | 0.822492 |
2 | 5 | 20 | 0.822492 |
# characteristics of the "optimized" model (max_depth=None, min_samples_split=2)
print(tuned_mybest)
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, random_state=0, splitter='best')
# evaluation on the test sample:
# the 30% (1.0 - 0.7) held out beforehand
# when the session was initialized
session_bis.predict_model(tuned_mybest)
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Decision Tree Classifier | 0.9668 | 0.9806 | 0.9668 | 0.9671 | 0.9669 | 0.9613 | 0.9613 |
REGION_CENTROID_COL | REGION_CENTROID_ROW | REGION_PIXEL_COUNT | SHORT_LINE_DENSITY_5 | SHORT_LINE_DENSITY_2 | VEDGE_MEAN | VEDGE_SD | HEDGE_MEAN | HEDGE_SD | INTENSITY_MEAN | ... | RAWGREEN_MEAN | EXRED_MEAN | EXBLUE_MEAN | EXGREEN_MEAN | VALUE_MEAN | SATURATION_MEAN | HUE_MEAN | classe | prediction_label | prediction_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1469 | 207 | 97 | 9 | 0.000000 | 0.0 | 0.888889 | 0.651852 | 0.777778 | 0.474074 | 2.629630 | ... | 1.444444 | -6.222222 | 9.777778 | -3.555556 | 5.888889 | 0.915344 | -2.269023 | FOLIAGE | FOLIAGE | 1.0 |
1368 | 90 | 101 | 9 | 0.111111 | 0.0 | 0.888889 | 0.385185 | 1.722222 | 1.885187 | 20.074074 | ... | 14.888889 | 0.777778 | 14.777778 | -15.555555 | 25.000000 | 0.404038 | -1.520125 | BRICKFACE | BRICKFACE | 1.0 |
554 | 139 | 115 | 9 | 0.111111 | 0.0 | 2.111111 | 0.958392 | 0.722221 | 0.646929 | 46.740742 | ... | 41.444443 | -12.888889 | 28.777779 | -15.888889 | 56.333332 | 0.264115 | -2.021125 | CEMENT | CEMENT | 1.0 |
1082 | 249 | 178 | 9 | 0.000000 | 0.0 | 2.277778 | 1.769076 | 5.222221 | 4.933632 | 47.962963 | ... | 42.333336 | -15.888889 | 32.777779 | -16.888889 | 58.888889 | 0.289863 | -2.102536 | PATH | PATH | 1.0 |
1679 | 100 | 111 | 9 | 0.000000 | 0.0 | 3.722222 | 20.951847 | 4.111111 | 19.496296 | 4.888889 | ... | 3.333333 | -8.000000 | 12.666667 | -4.666667 | 9.111111 | 0.864664 | -2.216473 | FOLIAGE | FOLIAGE | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
389 | 161 | 89 | 9 | 0.000000 | 0.0 | 4.666667 | 5.116423 | 0.166667 | 0.278887 | 3.111111 | ... | 2.666667 | -0.666667 | 2.000000 | -1.333333 | 3.777778 | 0.097643 | -1.890484 | WINDOW | WINDOW | 1.0 |
178 | 118 | 180 | 9 | 0.000000 | 0.0 | 1.944445 | 1.481991 | 3.111111 | 1.088663 | 48.555557 | ... | 42.555557 | -13.333333 | 31.333334 | -18.000000 | 59.000000 | 0.278822 | -1.996042 | PATH | PATH | 1.0 |
734 | 220 | 39 | 9 | 0.111111 | 0.0 | 0.666668 | 0.730297 | 1.333333 | 1.333333 | 113.000000 | ... | 108.444443 | -40.666668 | 54.333332 | -13.666667 | 131.111115 | 0.241459 | -2.393564 | SKY | SKY | 1.0 |
921 | 116 | 35 | 9 | 0.000000 | 0.0 | 0.833333 | 0.781735 | 0.888888 | 0.544333 | 125.185188 | ... | 120.444443 | -34.222221 | 48.444443 | -14.222222 | 141.333344 | 0.194853 | -2.346636 | SKY | SKY | 1.0 |
489 | 219 | 132 | 9 | 0.000000 | 0.0 | 1.444445 | 0.655462 | 1.111111 | 0.750310 | 35.222221 | ... | 30.444445 | -17.000000 | 31.333334 | -14.333333 | 45.666668 | 0.354746 | -2.146969 | CEMENT | CEMENT | 1.0 |
693 rows × 22 columns
# final model for deployment:
# the model is re-trained on the entirety of the
# available data (train + test)
modele_definitif = session_bis.finalize_model(tuned_mybest)
print(modele_definitif)
Pipeline(memory=Memory(location=None), steps=[('label_encoding', TransformerWrapperWithInverse(exclude=None, include=None, transformer=LabelEncoder())), ('numerical_imputer', TransformerWrapper(exclude=None, include=['REGION_CENTROID_COL', 'REGION_CENTROID_ROW', 'REGION_PIXEL_COUNT', 'SHORT_LINE_DENSITY_5', 'SHORT_LINE_DENSITY_2', 'VEDGE_MEAN', 'VEDGE_SD', 'HEDG... transformer=StandardScaler(copy=True, with_mean=True, with_std=True))), ('actual_estimator', DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, random_state=0, splitter='best'))], verbose=False)