Environnement - Packages¶
In [27]:
# activer l'environnement
using Pkg
Pkg.activate("env_mlj_comparison")
Activating project at `c:\Users\ricco\Desktop\demo\env_mlj_comparison`
In [28]:
# liste des packages installés
Pkg.status()
Status `C:\Users\ricco\Desktop\demo\env_mlj_comparison\Project.toml` [324d7699] CategoricalArrays v1.1.0 [a93c6f00] DataFrames v1.8.2 [7806a523] DecisionTree v0.12.4 [f6006082] EvoTrees v0.18.5 [7073ff75] IJulia v1.34.4 [b1bec4e5] LIBSVM v0.8.1 [add582a8] MLJ v0.23.2 [c6f25543] MLJDecisionTreeInterface v0.4.4 [61c7150f] MLJLIBSVMInterface v0.2.2 [6ee0df7b] MLJLinearModels v0.10.1 [91a5bcdd] Plots v1.41.6 [30f210dd] ScientificTypesBase v3.1.0 [2913bbd2] StatsBase v0.34.10 [fdbf4ff8] XLSX v0.11.3
Importation et préparation des données¶
Importation, inspection¶
In [29]:
# packages
import DataFrames as DFR
import XLSX
# lecture des données
df = DFR.DataFrame(XLSX.readtable("./spambase.xlsx"))
# premières lignes
println(DFR.describe(df))
56×7 DataFrame Row │ variable mean min median max nmissing eltype │ Symbol Union… Any Union… Any Int64 DataType ─────┼───────────────────────────────────────────────────────────────────────────────── 1 │ wf_make 0.104553 0.0 0.0 4.54 0 Float64 2 │ wf_address 0.213015 0.0 0.0 14.28 0 Float64 3 │ wf_all 0.280656 0.0 0.0 5.1 0 Float64 4 │ wf_3d 0.0654249 0.0 0.0 42.81 0 Float64 5 │ wf_our 0.312223 0.0 0.0 10.0 0 Float64 6 │ wf_over 0.0959009 0.0 0.0 5.88 0 Float64 7 │ wf_remove 0.114208 0.0 0.0 7.27 0 Float64 8 │ wf_internet 0.105295 0.0 0.0 11.11 0 Float64 9 │ wf_order 0.0900674 0.0 0.0 5.26 0 Float64 10 │ wf_mail 0.239413 0.0 0.0 18.18 0 Float64 11 │ wf_receive 0.059824 0.0 0.0 2.61 0 Float64 12 │ wf_will 0.541702 0.0 0.1 9.67 0 Float64 13 │ wf_people 0.0939296 0.0 0.0 5.55 0 Float64 14 │ wf_report 0.0586264 0.0 0.0 10.0 0 Float64 15 │ wf_addresses 0.0492045 0.0 0.0 4.41 0 Float64 16 │ wf_free 0.248848 0.0 0.0 20.0 0 Float64 17 │ wf_business 0.142586 0.0 0.0 7.14 0 Float64 18 │ wf_email 0.184745 0.0 0.0 9.09 0 Float64 19 │ wf_you 1.6621 0.0 1.31 18.75 0 Float64 20 │ wf_credit 0.085577 0.0 0.0 18.18 0 Float64 21 │ wf_your 0.809761 0.0 0.22 11.11 0 Float64 22 │ wf_font 0.121202 0.0 0.0 17.1 0 Float64 23 │ wf_000 0.101645 0.0 0.0 5.45 0 Float64 24 │ wf_money 0.0942686 0.0 0.0 12.5 0 Float64 25 │ wf_hp 0.549504 0.0 0.0 20.83 0 Float64 26 │ wf_hpl 0.265384 0.0 0.0 16.66 0 Float64 27 │ wf_lab 0.0989155 0.0 0.0 14.28 0 Float64 28 │ wf_labs 0.102852 0.0 0.0 5.88 0 Float64 29 │ wf_telnet 0.0647533 0.0 0.0 12.5 0 Float64 30 │ wf_857 0.0470485 0.0 0.0 4.76 0 Float64 31 │ wf_data 0.0972289 0.0 0.0 18.18 0 Float64 32 │ wf_415 0.0478353 0.0 0.0 4.76 0 Float64 33 │ wf_85 0.105412 0.0 0.0 20.0 0 Float64 34 │ wf_technology 0.0974766 0.0 0.0 7.69 0 Float64 35 │ wf_1999 0.136953 0.0 0.0 6.89 0 Float64 36 │ wf_parts 0.0132015 0.0 0.0 8.33 0 Float64 37 │ wf_pm 0.0786286 0.0 0.0 11.11 0 Float64 38 │ wf_direct 0.0648337 0.0 0.0 4.76 0 Float64 39 │ wf_cs 0.0436666 0.0 0.0 7.14 0 Float64 40 │ wf_meeting 0.132339 0.0 0.0 14.28 0 Float64 41 │ wf_original 0.0460987 0.0 0.0 3.57 0 Float64 42 │ wf_project 0.0791958 0.0 0.0 20.0 0 Float64 43 │ wf_re 0.301224 0.0 0.0 21.42 0 Float64 44 │ wf_edu 0.179824 0.0 0.0 22.05 0 Float64 45 │ wf_table 0.00544447 0.0 0.0 2.17 0 Float64 46 │ wf_conference 0.0318692 0.0 0.0 10.0 0 Float64 47 │ cf_; 0.0385747 0.0 0.0 4.385 0 Float64 48 │ cf_( 0.13903 0.0 0.065 9.752 0 Float64 49 │ cf_[ 0.0169759 0.0 0.0 4.081 0 Float64 50 │ cf_! 0.269071 0.0 0.0 32.478 0 Float64 51 │ cf_$ 0.0758107 0.0 0.0 6.003 0 Float64 52 │ cf_# 0.0442382 0.0 0.0 19.829 0 Float64 53 │ capital_run_length_average 5.19152 1.0 2.276 1102.5 0 Float64 54 │ capital_run_length_longest 52.1728 1 15.0 9989 0 Int64 55 │ capital_run_length_total 283.289 1 95.0 15841 0 Int64 56 │ spam no yes 0 String
In [30]:
# dimension
println(DFR.size(df))
(4601, 56)
In [31]:
# retirer la limitation d'affichage
ENV["LINES"] = 1000
# vérifier le schéma de la base
# et les types scientifiques
import MLJ
MLJ.schema(df)
┌────────────────────────────┬────────────┬─────────┐ │ names │ scitypes │ types │ ├────────────────────────────┼────────────┼─────────┤ │ wf_make │ Continuous │ Float64 │ │ wf_address │ Continuous │ Float64 │ │ wf_all │ Continuous │ Float64 │ │ wf_3d │ Continuous │ Float64 │ │ wf_our │ Continuous │ Float64 │ │ wf_over │ Continuous │ Float64 │ │ wf_remove │ Continuous │ Float64 │ │ wf_internet │ Continuous │ Float64 │ │ wf_order │ Continuous │ Float64 │ │ wf_mail │ Continuous │ Float64 │ │ wf_receive │ Continuous │ Float64 │ │ wf_will │ Continuous │ Float64 │ │ wf_people │ Continuous │ Float64 │ │ wf_report │ Continuous │ Float64 │ │ wf_addresses │ Continuous │ Float64 │ │ wf_free │ Continuous │ Float64 │ │ wf_business │ Continuous │ Float64 │ │ wf_email │ Continuous │ Float64 │ │ wf_you │ Continuous │ Float64 │ │ wf_credit │ Continuous │ Float64 │ │ wf_your │ Continuous │ Float64 │ │ wf_font │ Continuous │ Float64 │ │ wf_000 │ Continuous │ Float64 │ │ wf_money │ Continuous │ Float64 │ │ wf_hp │ Continuous │ Float64 │ │ wf_hpl │ Continuous │ Float64 │ │ wf_lab │ Continuous │ Float64 │ │ wf_labs │ Continuous │ Float64 │ │ wf_telnet │ Continuous │ Float64 │ │ wf_857 │ Continuous │ Float64 │ │ wf_data │ Continuous │ Float64 │ │ wf_415 │ Continuous │ Float64 │ │ wf_85 │ Continuous │ Float64 │ │ wf_technology │ Continuous │ Float64 │ │ wf_1999 │ Continuous │ Float64 │ │ wf_parts │ Continuous │ Float64 │ │ wf_pm │ Continuous │ Float64 │ │ wf_direct │ Continuous │ Float64 │ │ wf_cs │ Continuous │ Float64 │ │ wf_meeting │ Continuous │ Float64 │ │ wf_original │ Continuous │ Float64 │ │ wf_project │ Continuous │ Float64 │ │ wf_re │ Continuous │ Float64 │ │ wf_edu │ Continuous │ Float64 │ │ wf_table │ Continuous │ Float64 │ │ wf_conference │ Continuous │ Float64 │ │ cf_; │ Continuous │ Float64 │ │ cf_( │ Continuous │ Float64 │ │ cf_[ │ Continuous │ Float64 │ │ cf_! │ Continuous │ Float64 │ │ cf_$ │ Continuous │ Float64 │ │ cf_# │ Continuous │ Float64 │ │ capital_run_length_average │ Continuous │ Float64 │ │ capital_run_length_longest │ Count │ Int64 │ │ capital_run_length_total │ Count │ Int64 │ │ spam │ Textual │ String │ └────────────────────────────┴────────────┴─────────┘
Préparation des structures¶
In [32]:
# isoler y et X dans des structures distinctes
y, X = MLJ.unpack(df,==(:spam))
# dimensions
println("Dim. de y = $(DFR.size(y))")
println("Dim. de X = $(DFR.size(X))")
Dim. de y = (4601,) Dim. de X = (4601, 55)
Ajustement du type des variables¶
In [33]:
# convertir y en variable catégorielle pour la rég. logistique
# équivalent du type factor sous R
# utilisation du package CategoricalArrays
import CategoricalArrays as CA
y = CA.categorical(y)
# vérification des modalités
CA.levels(y)
2-element CategoricalArrays.CategoricalArray{String,1,UInt32}:
"no"
"yes"
In [34]:
# fréquences des classes
DFR.combine(DFR.groupby(DFR.DataFrame(y=y),:y),DFR.nrow => :freq)
2×2 DataFrame
| Row | y | freq |
|---|---|---|
| Cat… | Int64 | |
| 1 | no | 2788 |
| 2 | yes | 1813 |
In [35]:
# transformer les X count en variables continues
# utilisation du package ScientificTypesBase
import ScientificTypesBase as STB
X = MLJ.coerce(X,STB.Count => STB.Continuous)
# schéma
MLJ.schema(X)
┌────────────────────────────┬────────────┬─────────┐ │ names │ scitypes │ types │ ├────────────────────────────┼────────────┼─────────┤ │ wf_make │ Continuous │ Float64 │ │ wf_address │ Continuous │ Float64 │ │ wf_all │ Continuous │ Float64 │ │ wf_3d │ Continuous │ Float64 │ │ wf_our │ Continuous │ Float64 │ │ wf_over │ Continuous │ Float64 │ │ wf_remove │ Continuous │ Float64 │ │ wf_internet │ Continuous │ Float64 │ │ wf_order │ Continuous │ Float64 │ │ wf_mail │ Continuous │ Float64 │ │ wf_receive │ Continuous │ Float64 │ │ wf_will │ Continuous │ Float64 │ │ wf_people │ Continuous │ Float64 │ │ wf_report │ Continuous │ Float64 │ │ wf_addresses │ Continuous │ Float64 │ │ wf_free │ Continuous │ Float64 │ │ wf_business │ Continuous │ Float64 │ │ wf_email │ Continuous │ Float64 │ │ wf_you │ Continuous │ Float64 │ │ wf_credit │ Continuous │ Float64 │ │ wf_your │ Continuous │ Float64 │ │ wf_font │ Continuous │ Float64 │ │ wf_000 │ Continuous │ Float64 │ │ wf_money │ Continuous │ Float64 │ │ wf_hp │ Continuous │ Float64 │ │ wf_hpl │ Continuous │ Float64 │ │ wf_lab │ Continuous │ Float64 │ │ wf_labs │ Continuous │ Float64 │ │ wf_telnet │ Continuous │ Float64 │ │ wf_857 │ Continuous │ Float64 │ │ wf_data │ Continuous │ Float64 │ │ wf_415 │ Continuous │ Float64 │ │ wf_85 │ Continuous │ Float64 │ │ wf_technology │ Continuous │ Float64 │ │ wf_1999 │ Continuous │ Float64 │ │ wf_parts │ Continuous │ Float64 │ │ wf_pm │ Continuous │ Float64 │ │ wf_direct │ Continuous │ Float64 │ │ wf_cs │ Continuous │ Float64 │ │ wf_meeting │ Continuous │ Float64 │ │ wf_original │ Continuous │ Float64 │ │ wf_project │ Continuous │ Float64 │ │ wf_re │ Continuous │ Float64 │ │ wf_edu │ Continuous │ Float64 │ │ wf_table │ Continuous │ Float64 │ │ wf_conference │ Continuous │ Float64 │ │ cf_; │ Continuous │ Float64 │ │ cf_( │ Continuous │ Float64 │ │ cf_[ │ Continuous │ Float64 │ │ cf_! │ Continuous │ Float64 │ │ cf_$ │ Continuous │ Float64 │ │ cf_# │ Continuous │ Float64 │ │ capital_run_length_average │ Continuous │ Float64 │ │ capital_run_length_longest │ Continuous │ Float64 │ │ capital_run_length_total │ Continuous │ Float64 │ └────────────────────────────┴────────────┴─────────┘
Partition en TRAIN/TEST¶
In [36]:
# effectif
n = DFR.nrow(X)
println(n)
4601
In [37]:
# identifiants -> échantillonner 601 parmi 4601, sans remise
import StatsBase
idTrain = StatsBase.sample(1:n,601,replace=false)
println(length(idTrain))
601
In [38]:
# et les individus en test
idTest = setdiff(1:n,idTrain)
println(length(idTest))
4000
In [39]:
# structures y et X pour train/test
# par indexation avec les indices
yTrain, yTest = y[idTrain], y[idTest]
XTrain, XTest = X[idTrain,:], X[idTest,:]
# afficher les dimensions pour vérifications
println("Dim. de y = $(DFR.size(yTrain)) et $(DFR.size(yTest))")
println("Dim. de X = $(DFR.size(XTrain)) et $(DFR.size(XTest))")
Dim. de y = (601,) et (4000,) Dim. de X = (601, 55) et (4000, 55)
Comparaison des algorithmes (schéma holdout)¶
Chargement des modèles (à partir des packages spécialisés)¶
In [40]:
# chargement des modèles à comparer
DecisionTreeClassifier = @MLJ.load DecisionTreeClassifier pkg=DecisionTree
RandomForestClassifier = @MLJ.load RandomForestClassifier pkg=DecisionTree
LogisticClassifier = @MLJ.load LogisticClassifier pkg=MLJLinearModels
LinearSVC = @MLJ.load LinearSVC pkg=LIBSVM
EvoTreeClassifier = @MLJ.load EvoTreeClassifier pkg=EvoTrees
┌ Info: For silent loading, specify `verbosity=0`. └ @ Main C:\Users\ricco\.julia\packages\MLJModels\9LbNu\src\loading.jl:159
import MLJDecisionTreeInterface ✔ import MLJDecisionTreeInterface ✔ import MLJLinearModels ✔ import MLJLIBSVMInterface ✔ import EvoTrees ✔
┌ Info: For silent loading, specify `verbosity=0`. └ @ Main C:\Users\ricco\.julia\packages\MLJModels\9LbNu\src\loading.jl:159 ┌ Info: For silent loading, specify `verbosity=0`. └ @ Main C:\Users\ricco\.julia\packages\MLJModels\9LbNu\src\loading.jl:159 ┌ Info: For silent loading, specify `verbosity=0`. └ @ Main C:\Users\ricco\.julia\packages\MLJModels\9LbNu\src\loading.jl:159 ┌ Info: For silent loading, specify `verbosity=0`. └ @ Main C:\Users\ricco\.julia\packages\MLJModels\9LbNu\src\loading.jl:159
EvoTrees.EvoTreeClassifier
Instanciation (paramètres par défaut)¶
In [41]:
# instanciation des modèles et intégration
# dans une structure de type liste de tuples
models = [
("Decision Tree", DecisionTreeClassifier()),
("Random Forest", RandomForestClassifier()),
("Logistic Regression", LogisticClassifier()),
("Gradient Boosting", EvoTreeClassifier())
]
4-element Vector{Tuple{String, MLJModelInterface.Probabilistic}}:
("Decision Tree", DecisionTreeClassifier(max_depth = -1, …))
("Random Forest", RandomForestClassifier(max_depth = -1, …))
("Logistic Regression", LogisticClassifier(lambda = 2.220446049250313e-16, …))
("Gradient Boosting", EvoTreeClassifier(loss = mlogloss, …))
Inspection des paramètres¶
In [42]:
# paramètres de l'arbre par exemple
# instancié avec les paramètres par défaut
println("*** Parametre (par défaut) de << Arbre de décision ***")
for (param,value) in pairs(MLJ.params(DecisionTreeClassifier()))
println("$param = $value")
end
*** Parametre (par défaut) de << Arbre de décision *** max_depth = -1 min_samples_leaf = 1 min_samples_split = 2 min_purity_increase = 0.0 n_subfeatures = 0 post_prune = false merge_purity_threshold = 1.0 display_depth = 5 feature_importance = impurity rng = Random.TaskLocalRNG()
In [43]:
# paramètres pour l'ensemble des algos instaciés
# disponibles dans la liste << models >>
for i in 1:length(models)
println()
println("*** Parametre de << $(models[i][1]) >> ***")
for (param,value) in pairs(MLJ.params(models[i][2]))
println("$param = $value")
end
end
*** Parametre de << Decision Tree >> *** max_depth = -1 min_samples_leaf = 1 min_samples_split = 2 min_purity_increase = 0.0 n_subfeatures = 0 post_prune = false merge_purity_threshold = 1.0 display_depth = 5 feature_importance = impurity rng = Random.TaskLocalRNG() *** Parametre de << Random Forest >> *** max_depth = -1 min_samples_leaf = 1 min_samples_split = 2 min_purity_increase = 0.0 n_subfeatures = -1 n_trees = 100 sampling_fraction = 0.7 feature_importance = impurity rng = Random.TaskLocalRNG() *** Parametre de << Logistic Regression >> *** lambda = 2.220446049250313e-16 gamma = 0.0 penalty = l2 fit_intercept = true penalize_intercept = false scale_penalty_with_samples = true solver = nothing *** Parametre de << Gradient Boosting >> *** loss = mlogloss metric = mlogloss nrounds = 100 bagging_size = 1 early_stopping_rounds = 9223372036854775807 L2 = 1.0 lambda = 0.0 gamma = 0.0 eta = 0.1 max_depth = 6 min_weight = 1.0 rowsample = 1.0 colsample = 1.0 nbins = 64 tree_type = binary seed = 123 device = cpu
Expérimentation¶
In [44]:
# lancer l'expérimentation
# structure de récupération des résultats -> liste
results = []
# pour chaque algo
for (name, model) in models
# préparation - entraînement
mach = MLJ.machine(model, XTrain, yTrain)
MLJ.fit!(mach, verbosity=0)
# prédiction en test
ypred = MLJ.predict(mach, XTest) # proba d'appartenance
ypred = MLJ.mode.(ypred) # convertir probabilités -> classes
# accuracy
acc = MLJ.accuracy(ypred, yTest)
# rajouter le résultat sous la forme de named-tuple
# dans la liste
push!(results, (model=name, accuracy=acc))
end
# transformation de la liste en dataframe
df_results = DFR.DataFrame(results)
4×2 DataFrame
| Row | model | accuracy |
|---|---|---|
| String | Float64 | |
| 1 | Decision Tree | 0.87275 |
| 2 | Random Forest | 0.934 |
| 3 | Logistic Regression | 0.9025 |
| 4 | Gradient Boosting | 0.92825 |
In [45]:
# affichage avec tri selon les performances
DFR.sort!(df_results,:accuracy,rev=true)
df_results
4×2 DataFrame
| Row | model | accuracy |
|---|---|---|
| String | Float64 | |
| 1 | Random Forest | 0.934 |
| 2 | Gradient Boosting | 0.92825 |
| 3 | Logistic Regression | 0.9025 |
| 4 | Decision Tree | 0.87275 |
In [46]:
# représentation graphique
import Plots
Plots.bar(df_results.model,df_results.accuracy,legend=false)