In [ ]:
# Python version used for this notebook (bare expression below shows it as Out[])
import sys
sys.version
Out[ ]:
'3.11.6 | packaged by conda-forge | (main, Oct  3 2023, 10:29:11) [MSC v.1935 64 bit (AMD64)]'
In [ ]:
# PyCaret version (the AutoML library used throughout this notebook)
import pycaret
print(pycaret.__version__)
3.2.0
In [ ]:
# MLflow version (PyCaret logs experiments to MLflow below)
import mlflow
mlflow.__version__
Out[ ]:
'1.30.1'

Importation des données

In [ ]:
# Change the working directory to where the data file lives.
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR (e.g. via pathlib.Path) before sharing/re-running elsewhere.
import os
os.chdir("C:/Users/ricco/Desktop/demo")
In [ ]:
# Load the labeled dataset (first sheet of the Excel workbook);
# .info() shows 2310 rows, 19 numeric features + 1 'classe' target
import pandas
DLabeled = pandas.read_excel("image.xlsx",sheet_name=0)
DLabeled.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2310 entries, 0 to 2309
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   classe                2310 non-null   object 
 1   REGION_CENTROID_COL   2310 non-null   int64  
 2   REGION_CENTROID_ROW   2310 non-null   int64  
 3   REGION_PIXEL_COUNT    2310 non-null   int64  
 4   SHORT_LINE_DENSITY_5  2310 non-null   float64
 5   SHORT_LINE_DENSITY_2  2310 non-null   float64
 6   VEDGE_MEAN            2310 non-null   float64
 7   VEDGE_SD              2310 non-null   float64
 8   HEDGE_MEAN            2310 non-null   float64
 9   HEDGE_SD              2310 non-null   float64
 10  INTENSITY_MEAN        2310 non-null   float64
 11  RAWRED_MEAN           2310 non-null   float64
 12  RAWBLUE_MEAN          2310 non-null   float64
 13  RAWGREEN_MEAN         2310 non-null   float64
 14  EXRED_MEAN            2310 non-null   float64
 15  EXBLUE_MEAN           2310 non-null   float64
 16  EXGREEN_MEAN          2310 non-null   float64
 17  VALUE_MEAN            2310 non-null   float64
 18  SATURATION_MEAN       2310 non-null   float64
 19  HUE_MEAN              2310 non-null   float64
dtypes: float64(16), int64(3), object(1)
memory usage: 361.1+ KB
In [ ]:
# Distribution of the target classes (balanced: 330 rows per class)
print(DLabeled["classe"].value_counts())
BRICKFACE    330
SKY          330
FOLIAGE      330
CEMENT       330
WINDOW       330
PATH         330
GRASS        330
Name: classe, dtype: int64

Comparaison des algorithmes

In [ ]:
# Import PyCaret's object-oriented experimentation API for classification
from pycaret.classification import ClassificationExperiment
In [ ]:
# Create a working session (experiment): 70/30 stratified train/test split,
# z-score normalization, 5-fold stratified CV, fixed seed, logged to MLflow
session_prim = ClassificationExperiment()
session_prim.setup(data=DLabeled,normalize=True,target='classe',
                   train_size=0.7,data_split_stratify=True,fold=5,session_id=0,
                   log_experiment=True,experiment_name="recherche_modele")
  Description Value
0 Session id 0
1 Target classe
2 Target type Multiclass
3 Target mapping BRICKFACE: 0, CEMENT: 1, FOLIAGE: 2, GRASS: 3, PATH: 4, SKY: 5, WINDOW: 6
4 Original data shape (2310, 20)
5 Transformed data shape (2310, 20)
6 Transformed train set shape (1617, 20)
7 Transformed test set shape (693, 20)
8 Numeric features 19
9 Preprocess True
10 Imputation type simple
11 Numeric imputation mean
12 Categorical imputation mode
13 Normalize True
14 Normalize method zscore
15 Fold Generator StratifiedKFold
16 Fold Number 5
17 CPU Jobs -1
18 Use GPU False
19 Log Experiment MlflowLogger
20 Experiment Name recherche_modele
21 USI 379f
Out[ ]:
<pycaret.classification.oop.ClassificationExperiment at 0x1b6745267d0>
In [ ]:
# List the algorithms available in the environment that was created
algos = session_prim.models()
print(algos)
                                     Name  \
ID                                          
lr                    Logistic Regression   
knn                K Neighbors Classifier   
nb                            Naive Bayes   
dt               Decision Tree Classifier   
svm                   SVM - Linear Kernel   
rbfsvm                SVM - Radial Kernel   
gpc           Gaussian Process Classifier   
mlp                        MLP Classifier   
ridge                    Ridge Classifier   
rf               Random Forest Classifier   
qda       Quadratic Discriminant Analysis   
ada                  Ada Boost Classifier   
gbc          Gradient Boosting Classifier   
lda          Linear Discriminant Analysis   
et                 Extra Trees Classifier   
lightgbm  Light Gradient Boosting Machine   
catboost              CatBoost Classifier   
dummy                    Dummy Classifier   

                                                  Reference  Turbo  
ID                                                                  
lr        sklearn.linear_model._logistic.LogisticRegression   True  
knn       sklearn.neighbors._classification.KNeighborsCl...   True  
nb                           sklearn.naive_bayes.GaussianNB   True  
dt             sklearn.tree._classes.DecisionTreeClassifier   True  
svm       sklearn.linear_model._stochastic_gradient.SGDC...   True  
rbfsvm                             sklearn.svm._classes.SVC  False  
gpc       sklearn.gaussian_process._gpc.GaussianProcessC...  False  
mlp       sklearn.neural_network._multilayer_perceptron....  False  
ridge           sklearn.linear_model._ridge.RidgeClassifier   True  
rf          sklearn.ensemble._forest.RandomForestClassifier   True  
qda       sklearn.discriminant_analysis.QuadraticDiscrim...   True  
ada       sklearn.ensemble._weight_boosting.AdaBoostClas...   True  
gbc         sklearn.ensemble._gb.GradientBoostingClassifier   True  
lda       sklearn.discriminant_analysis.LinearDiscrimina...   True  
et            sklearn.ensemble._forest.ExtraTreesClassifier   True  
lightgbm                    lightgbm.sklearn.LGBMClassifier   True  
catboost                   catboost.core.CatBoostClassifier   True  
dummy                         sklearn.dummy.DummyClassifier   True  
In [ ]:
# Compare a chosen subset of models under cross-validation and select
# the best one by accuracy
top_models = session_prim.compare_models(sort='Accuracy',include=['lr','nb','dt','svm','lda'])
  Model Accuracy AUC Recall Prec. F1 Kappa MCC TT (Sec)
dt Decision Tree Classifier 0.9511 0.9715 0.9511 0.9524 0.9512 0.9430 0.9432 0.0180
lr Logistic Regression 0.9301 0.9925 0.9301 0.9312 0.9303 0.9185 0.9186 0.9700
svm SVM - Linear Kernel 0.9196 0.0000 0.9196 0.9232 0.9196 0.9062 0.9070 0.0260
lda Linear Discriminant Analysis 0.9140 0.9883 0.9140 0.9165 0.9130 0.8997 0.9005 0.0180
nb Naive Bayes 0.7885 0.9677 0.7885 0.8057 0.7726 0.7533 0.7612 0.7480
In [ ]:
# Retrieve the detailed results of the last command as a DataFrame
session_prim.pull()
Out[ ]:
Model Accuracy AUC Recall Prec. F1 Kappa MCC TT (Sec)
dt Decision Tree Classifier 0.9511 0.9715 0.9511 0.9524 0.9512 0.9430 0.9432 0.018
lr Logistic Regression 0.9301 0.9925 0.9301 0.9312 0.9303 0.9185 0.9186 0.970
svm SVM - Linear Kernel 0.9196 0.0000 0.9196 0.9232 0.9196 0.9062 0.9070 0.026
lda Linear Discriminant Analysis 0.9140 0.9883 0.9140 0.9165 0.9130 0.8997 0.9005 0.018
nb Naive Bayes 0.7885 0.9677 0.7885 0.8057 0.7726 0.7533 0.7612 0.748

Optimisation des paramètres - Arbre de décision

In [ ]:
# Second session for hyperparameter tuning: same data/split/CV settings
# as before, logged under a separate MLflow experiment name
session_bis = ClassificationExperiment()
session_bis.setup(data=DLabeled,normalize=True,target='classe',
                train_size=0.7,data_split_stratify=True,fold=5,session_id=0,
                log_experiment=True,experiment_name="optimisation_modele")
  Description Value
0 Session id 0
1 Target classe
2 Target type Multiclass
3 Target mapping BRICKFACE: 0, CEMENT: 1, FOLIAGE: 2, GRASS: 3, PATH: 4, SKY: 5, WINDOW: 6
4 Original data shape (2310, 20)
5 Transformed data shape (2310, 20)
6 Transformed train set shape (1617, 20)
7 Transformed test set shape (693, 20)
8 Numeric features 19
9 Preprocess True
10 Imputation type simple
11 Numeric imputation mean
12 Categorical imputation mode
13 Normalize True
14 Normalize method zscore
15 Fold Generator StratifiedKFold
16 Fold Number 5
17 CPU Jobs -1
18 Use GPU False
19 Log Experiment MlflowLogger
20 Experiment Name optimisation_modele
21 USI 87c2
Out[ ]:
<pycaret.classification.oop.ClassificationExperiment at 0x1b678f9da10>
In [ ]:
# Instantiate the decision tree with an initial, deliberately constrained
# parameterization (shallow depth, large min split) as a baseline
mybest = session_bis.create_model("dt",min_samples_split=50,max_depth=5)
print(mybest)
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.8457 0.9645 0.8457 0.9007 0.8310 0.8200 0.8323
1 0.8302 0.9645 0.8302 0.7529 0.7785 0.8020 0.8201
2 0.8235 0.9591 0.8235 0.7520 0.7755 0.7941 0.8134
3 0.7957 0.9521 0.7957 0.7408 0.7533 0.7616 0.7825
4 0.8173 0.9633 0.8173 0.7596 0.7737 0.7869 0.8082
Mean 0.8225 0.9607 0.8225 0.7812 0.7824 0.7929 0.8113
Std 0.0164 0.0047 0.0164 0.0601 0.0259 0.0191 0.0165
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=50, min_weight_fraction_leaf=0.0,
                       random_state=0, splitter='best')
In [ ]:
# Grid-search the tree's hyperparameters (3x3 = 9 candidates, 5-fold CV);
# choose_better keeps the original model if tuning does not improve accuracy,
# return_tuner also returns the fitted sklearn tuner ('essais' = trials)
tuned_mybest, essais = session_bis.tune_model(mybest,optimize="Accuracy",choose_better=True,
                                      custom_grid={'min_samples_split':[2,10,20],'max_depth':[5,10,None]},search_algorithm='grid',
                                      return_tuner = True)
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.9599 0.9766 0.9599 0.9598 0.9596 0.9532 0.9533
1 0.9506 0.9712 0.9506 0.9520 0.9502 0.9424 0.9427
2 0.9412 0.9657 0.9412 0.9431 0.9417 0.9314 0.9315
3 0.9474 0.9693 0.9474 0.9497 0.9478 0.9386 0.9388
4 0.9567 0.9747 0.9567 0.9574 0.9568 0.9494 0.9495
Mean 0.9511 0.9715 0.9511 0.9524 0.9512 0.9430 0.9432
Std 0.0066 0.0039 0.0066 0.0059 0.0064 0.0077 0.0077
Fitting 5 folds for each of 9 candidates, totalling 45 fits
In [ ]:
# Display all grid-search trials sorted by mean CV accuracy
# (the 'actual_estimator__' prefix comes from PyCaret's pipeline wrapping)
pandas.DataFrame.from_dict(essais.cv_results_)[
    ['param_actual_estimator__max_depth','param_actual_estimator__min_samples_split',
     'mean_test_score']].sort_values(by='mean_test_score',ascending=False)
Out[ ]:
param_actual_estimator__max_depth param_actual_estimator__min_samples_split mean_test_score
6 None 2 0.951139
3 10 2 0.951135
7 None 10 0.949908
4 10 10 0.948045
5 10 20 0.944953
8 None 20 0.941243
0 5 2 0.823109
1 5 10 0.822492
2 5 20 0.822492
In [ ]:
# Characteristics of the "optimized" model
print(tuned_mybest)
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       random_state=0, splitter='best')
In [ ]:
# Evaluation on the held-out test sample:
# the 30% (1.0 - 0.7) set aside beforehand
# when the session was initialized
session_bis.predict_model(tuned_mybest)
  Model Accuracy AUC Recall Prec. F1 Kappa MCC
0 Decision Tree Classifier 0.9668 0.9806 0.9668 0.9671 0.9669 0.9613 0.9613
Out[ ]:
REGION_CENTROID_COL REGION_CENTROID_ROW REGION_PIXEL_COUNT SHORT_LINE_DENSITY_5 SHORT_LINE_DENSITY_2 VEDGE_MEAN VEDGE_SD HEDGE_MEAN HEDGE_SD INTENSITY_MEAN ... RAWGREEN_MEAN EXRED_MEAN EXBLUE_MEAN EXGREEN_MEAN VALUE_MEAN SATURATION_MEAN HUE_MEAN classe prediction_label prediction_score
1469 207 97 9 0.000000 0.0 0.888889 0.651852 0.777778 0.474074 2.629630 ... 1.444444 -6.222222 9.777778 -3.555556 5.888889 0.915344 -2.269023 FOLIAGE FOLIAGE 1.0
1368 90 101 9 0.111111 0.0 0.888889 0.385185 1.722222 1.885187 20.074074 ... 14.888889 0.777778 14.777778 -15.555555 25.000000 0.404038 -1.520125 BRICKFACE BRICKFACE 1.0
554 139 115 9 0.111111 0.0 2.111111 0.958392 0.722221 0.646929 46.740742 ... 41.444443 -12.888889 28.777779 -15.888889 56.333332 0.264115 -2.021125 CEMENT CEMENT 1.0
1082 249 178 9 0.000000 0.0 2.277778 1.769076 5.222221 4.933632 47.962963 ... 42.333336 -15.888889 32.777779 -16.888889 58.888889 0.289863 -2.102536 PATH PATH 1.0
1679 100 111 9 0.000000 0.0 3.722222 20.951847 4.111111 19.496296 4.888889 ... 3.333333 -8.000000 12.666667 -4.666667 9.111111 0.864664 -2.216473 FOLIAGE FOLIAGE 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
389 161 89 9 0.000000 0.0 4.666667 5.116423 0.166667 0.278887 3.111111 ... 2.666667 -0.666667 2.000000 -1.333333 3.777778 0.097643 -1.890484 WINDOW WINDOW 1.0
178 118 180 9 0.000000 0.0 1.944445 1.481991 3.111111 1.088663 48.555557 ... 42.555557 -13.333333 31.333334 -18.000000 59.000000 0.278822 -1.996042 PATH PATH 1.0
734 220 39 9 0.111111 0.0 0.666668 0.730297 1.333333 1.333333 113.000000 ... 108.444443 -40.666668 54.333332 -13.666667 131.111115 0.241459 -2.393564 SKY SKY 1.0
921 116 35 9 0.000000 0.0 0.833333 0.781735 0.888888 0.544333 125.185188 ... 120.444443 -34.222221 48.444443 -14.222222 141.333344 0.194853 -2.346636 SKY SKY 1.0
489 219 132 9 0.000000 0.0 1.444445 0.655462 1.111111 0.750310 35.222221 ... 30.444445 -17.000000 31.333334 -14.333333 45.666668 0.354746 -2.146969 CEMENT CEMENT 1.0

693 rows × 22 columns

In [ ]:
# Final model for deployment:
# the model is re-trained on all of the
# available data (train + test)
modele_definitif = session_bis.finalize_model(tuned_mybest)
print(modele_definitif)
Pipeline(memory=Memory(location=None),
         steps=[('label_encoding',
                 TransformerWrapperWithInverse(exclude=None, include=None,
                                               transformer=LabelEncoder())),
                ('numerical_imputer',
                 TransformerWrapper(exclude=None,
                                    include=['REGION_CENTROID_COL',
                                             'REGION_CENTROID_ROW',
                                             'REGION_PIXEL_COUNT',
                                             'SHORT_LINE_DENSITY_5',
                                             'SHORT_LINE_DENSITY_2',
                                             'VEDGE_MEAN', 'VEDGE_SD',
                                             'HEDG...
                                    transformer=StandardScaler(copy=True,
                                                               with_mean=True,
                                                               with_std=True))),
                ('actual_estimator',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features=None, max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        random_state=0, splitter='best'))],
         verbose=False)