Environnement - Packages¶

In [27]:
# activer l'environnement
using Pkg
Pkg.activate("env_mlj_comparison")
  Activating project at `c:\Users\ricco\Desktop\demo\env_mlj_comparison`
In [28]:
# liste des packages installés
Pkg.status()
Status `C:\Users\ricco\Desktop\demo\env_mlj_comparison\Project.toml`
  [324d7699] CategoricalArrays v1.1.0
  [a93c6f00] DataFrames v1.8.2
  [7806a523] DecisionTree v0.12.4
  [f6006082] EvoTrees v0.18.5
  [7073ff75] IJulia v1.34.4
  [b1bec4e5] LIBSVM v0.8.1
  [add582a8] MLJ v0.23.2
  [c6f25543] MLJDecisionTreeInterface v0.4.4
  [61c7150f] MLJLIBSVMInterface v0.2.2
  [6ee0df7b] MLJLinearModels v0.10.1
  [91a5bcdd] Plots v1.41.6
  [30f210dd] ScientificTypesBase v3.1.0
  [2913bbd2] StatsBase v0.34.10
  [fdbf4ff8] XLSX v0.11.3

Importation et préparation des données¶

Importation, inspection¶

In [29]:
# packages
import DataFrames as DFR
import XLSX

# lecture des données
df = DFR.DataFrame(XLSX.readtable("./spambase.xlsx"))

# premières lignes
println(DFR.describe(df))
56×7 DataFrame
 Row │ variable                    mean        min  median  max     nmissing  eltype   
     │ Symbol                      Union…      Any  Union…  Any     Int64     DataType 
─────┼─────────────────────────────────────────────────────────────────────────────────
   1 │ wf_make                     0.104553    0.0  0.0     4.54           0  Float64
   2 │ wf_address                  0.213015    0.0  0.0     14.28          0  Float64
   3 │ wf_all                      0.280656    0.0  0.0     5.1            0  Float64
   4 │ wf_3d                       0.0654249   0.0  0.0     42.81          0  Float64
   5 │ wf_our                      0.312223    0.0  0.0     10.0           0  Float64
   6 │ wf_over                     0.0959009   0.0  0.0     5.88           0  Float64
   7 │ wf_remove                   0.114208    0.0  0.0     7.27           0  Float64
   8 │ wf_internet                 0.105295    0.0  0.0     11.11          0  Float64
   9 │ wf_order                    0.0900674   0.0  0.0     5.26           0  Float64
  10 │ wf_mail                     0.239413    0.0  0.0     18.18          0  Float64
  11 │ wf_receive                  0.059824    0.0  0.0     2.61           0  Float64
  12 │ wf_will                     0.541702    0.0  0.1     9.67           0  Float64
  13 │ wf_people                   0.0939296   0.0  0.0     5.55           0  Float64
  14 │ wf_report                   0.0586264   0.0  0.0     10.0           0  Float64
  15 │ wf_addresses                0.0492045   0.0  0.0     4.41           0  Float64
  16 │ wf_free                     0.248848    0.0  0.0     20.0           0  Float64
  17 │ wf_business                 0.142586    0.0  0.0     7.14           0  Float64
  18 │ wf_email                    0.184745    0.0  0.0     9.09           0  Float64
  19 │ wf_you                      1.6621      0.0  1.31    18.75          0  Float64
  20 │ wf_credit                   0.085577    0.0  0.0     18.18          0  Float64
  21 │ wf_your                     0.809761    0.0  0.22    11.11          0  Float64
  22 │ wf_font                     0.121202    0.0  0.0     17.1           0  Float64
  23 │ wf_000                      0.101645    0.0  0.0     5.45           0  Float64
  24 │ wf_money                    0.0942686   0.0  0.0     12.5           0  Float64
  25 │ wf_hp                       0.549504    0.0  0.0     20.83          0  Float64
  26 │ wf_hpl                      0.265384    0.0  0.0     16.66          0  Float64
  27 │ wf_lab                      0.0989155   0.0  0.0     14.28          0  Float64
  28 │ wf_labs                     0.102852    0.0  0.0     5.88           0  Float64
  29 │ wf_telnet                   0.0647533   0.0  0.0     12.5           0  Float64
  30 │ wf_857                      0.0470485   0.0  0.0     4.76           0  Float64
  31 │ wf_data                     0.0972289   0.0  0.0     18.18          0  Float64
  32 │ wf_415                      0.0478353   0.0  0.0     4.76           0  Float64
  33 │ wf_85                       0.105412    0.0  0.0     20.0           0  Float64
  34 │ wf_technology               0.0974766   0.0  0.0     7.69           0  Float64
  35 │ wf_1999                     0.136953    0.0  0.0     6.89           0  Float64
  36 │ wf_parts                    0.0132015   0.0  0.0     8.33           0  Float64
  37 │ wf_pm                       0.0786286   0.0  0.0     11.11          0  Float64
  38 │ wf_direct                   0.0648337   0.0  0.0     4.76           0  Float64
  39 │ wf_cs                       0.0436666   0.0  0.0     7.14           0  Float64
  40 │ wf_meeting                  0.132339    0.0  0.0     14.28          0  Float64
  41 │ wf_original                 0.0460987   0.0  0.0     3.57           0  Float64
  42 │ wf_project                  0.0791958   0.0  0.0     20.0           0  Float64
  43 │ wf_re                       0.301224    0.0  0.0     21.42          0  Float64
  44 │ wf_edu                      0.179824    0.0  0.0     22.05          0  Float64
  45 │ wf_table                    0.00544447  0.0  0.0     2.17           0  Float64
  46 │ wf_conference               0.0318692   0.0  0.0     10.0           0  Float64
  47 │ cf_;                        0.0385747   0.0  0.0     4.385          0  Float64
  48 │ cf_(                        0.13903     0.0  0.065   9.752          0  Float64
  49 │ cf_[                        0.0169759   0.0  0.0     4.081          0  Float64
  50 │ cf_!                        0.269071    0.0  0.0     32.478         0  Float64
  51 │ cf_$                        0.0758107   0.0  0.0     6.003          0  Float64
  52 │ cf_#                        0.0442382   0.0  0.0     19.829         0  Float64
  53 │ capital_run_length_average  5.19152     1.0  2.276   1102.5         0  Float64
  54 │ capital_run_length_longest  52.1728     1    15.0    9989           0  Int64
  55 │ capital_run_length_total    283.289     1    95.0    15841          0  Int64
  56 │ spam                                    no           yes            0  String
In [30]:
# dimension
println(DFR.size(df))
(4601, 56)
In [31]:
# retirer la limitation d'affichage
ENV["LINES"] = 1000
# vérifier le schéma de la base
# et les types scientifiques
import MLJ
MLJ.schema(df)
┌────────────────────────────┬────────────┬─────────┐
│ names                      │ scitypes   │ types   │
├────────────────────────────┼────────────┼─────────┤
│ wf_make                    │ Continuous │ Float64 │
│ wf_address                 │ Continuous │ Float64 │
│ wf_all                     │ Continuous │ Float64 │
│ wf_3d                      │ Continuous │ Float64 │
│ wf_our                     │ Continuous │ Float64 │
│ wf_over                    │ Continuous │ Float64 │
│ wf_remove                  │ Continuous │ Float64 │
│ wf_internet                │ Continuous │ Float64 │
│ wf_order                   │ Continuous │ Float64 │
│ wf_mail                    │ Continuous │ Float64 │
│ wf_receive                 │ Continuous │ Float64 │
│ wf_will                    │ Continuous │ Float64 │
│ wf_people                  │ Continuous │ Float64 │
│ wf_report                  │ Continuous │ Float64 │
│ wf_addresses               │ Continuous │ Float64 │
│ wf_free                    │ Continuous │ Float64 │
│ wf_business                │ Continuous │ Float64 │
│ wf_email                   │ Continuous │ Float64 │
│ wf_you                     │ Continuous │ Float64 │
│ wf_credit                  │ Continuous │ Float64 │
│ wf_your                    │ Continuous │ Float64 │
│ wf_font                    │ Continuous │ Float64 │
│ wf_000                     │ Continuous │ Float64 │
│ wf_money                   │ Continuous │ Float64 │
│ wf_hp                      │ Continuous │ Float64 │
│ wf_hpl                     │ Continuous │ Float64 │
│ wf_lab                     │ Continuous │ Float64 │
│ wf_labs                    │ Continuous │ Float64 │
│ wf_telnet                  │ Continuous │ Float64 │
│ wf_857                     │ Continuous │ Float64 │
│ wf_data                    │ Continuous │ Float64 │
│ wf_415                     │ Continuous │ Float64 │
│ wf_85                      │ Continuous │ Float64 │
│ wf_technology              │ Continuous │ Float64 │
│ wf_1999                    │ Continuous │ Float64 │
│ wf_parts                   │ Continuous │ Float64 │
│ wf_pm                      │ Continuous │ Float64 │
│ wf_direct                  │ Continuous │ Float64 │
│ wf_cs                      │ Continuous │ Float64 │
│ wf_meeting                 │ Continuous │ Float64 │
│ wf_original                │ Continuous │ Float64 │
│ wf_project                 │ Continuous │ Float64 │
│ wf_re                      │ Continuous │ Float64 │
│ wf_edu                     │ Continuous │ Float64 │
│ wf_table                   │ Continuous │ Float64 │
│ wf_conference              │ Continuous │ Float64 │
│ cf_;                       │ Continuous │ Float64 │
│ cf_(                       │ Continuous │ Float64 │
│ cf_[                       │ Continuous │ Float64 │
│ cf_!                       │ Continuous │ Float64 │
│ cf_$                       │ Continuous │ Float64 │
│ cf_#                       │ Continuous │ Float64 │
│ capital_run_length_average │ Continuous │ Float64 │
│ capital_run_length_longest │ Count      │ Int64   │
│ capital_run_length_total   │ Count      │ Int64   │
│ spam                       │ Textual    │ String  │
└────────────────────────────┴────────────┴─────────┘

Préparation des structures¶

In [32]:
# isoler y et X dans des structures distinctes
y, X = MLJ.unpack(df,==(:spam))

# dimensions
println("Dim. de y = $(DFR.size(y))")
println("Dim. de X = $(DFR.size(X))")
Dim. de y = (4601,)
Dim. de X = (4601, 55)

Ajustement du type des variables¶

In [33]:
# convertir y en variable catégorielle pour la rég. logistique
# équivalent du type factor sous R
# utilisation du package CategoricalArrays
import CategoricalArrays as CA
y = CA.categorical(y)

# vérification des modalités
CA.levels(y)
2-element CategoricalArrays.CategoricalArray{String,1,UInt32}:
 "no"
 "yes"
In [34]:
# fréquences des classes
DFR.combine(DFR.groupby(DFR.DataFrame(y=y),:y),DFR.nrow => :freq)
2×2 DataFrame
Rowyfreq
Cat…Int64
1no2788
2yes1813
In [35]:
# transformer les X count en variables continues
# utilisation du package ScientificTypesBase
import ScientificTypesBase as STB
X = MLJ.coerce(X,STB.Count => STB.Continuous)

# schéma
MLJ.schema(X)
┌────────────────────────────┬────────────┬─────────┐
│ names                      │ scitypes   │ types   │
├────────────────────────────┼────────────┼─────────┤
│ wf_make                    │ Continuous │ Float64 │
│ wf_address                 │ Continuous │ Float64 │
│ wf_all                     │ Continuous │ Float64 │
│ wf_3d                      │ Continuous │ Float64 │
│ wf_our                     │ Continuous │ Float64 │
│ wf_over                    │ Continuous │ Float64 │
│ wf_remove                  │ Continuous │ Float64 │
│ wf_internet                │ Continuous │ Float64 │
│ wf_order                   │ Continuous │ Float64 │
│ wf_mail                    │ Continuous │ Float64 │
│ wf_receive                 │ Continuous │ Float64 │
│ wf_will                    │ Continuous │ Float64 │
│ wf_people                  │ Continuous │ Float64 │
│ wf_report                  │ Continuous │ Float64 │
│ wf_addresses               │ Continuous │ Float64 │
│ wf_free                    │ Continuous │ Float64 │
│ wf_business                │ Continuous │ Float64 │
│ wf_email                   │ Continuous │ Float64 │
│ wf_you                     │ Continuous │ Float64 │
│ wf_credit                  │ Continuous │ Float64 │
│ wf_your                    │ Continuous │ Float64 │
│ wf_font                    │ Continuous │ Float64 │
│ wf_000                     │ Continuous │ Float64 │
│ wf_money                   │ Continuous │ Float64 │
│ wf_hp                      │ Continuous │ Float64 │
│ wf_hpl                     │ Continuous │ Float64 │
│ wf_lab                     │ Continuous │ Float64 │
│ wf_labs                    │ Continuous │ Float64 │
│ wf_telnet                  │ Continuous │ Float64 │
│ wf_857                     │ Continuous │ Float64 │
│ wf_data                    │ Continuous │ Float64 │
│ wf_415                     │ Continuous │ Float64 │
│ wf_85                      │ Continuous │ Float64 │
│ wf_technology              │ Continuous │ Float64 │
│ wf_1999                    │ Continuous │ Float64 │
│ wf_parts                   │ Continuous │ Float64 │
│ wf_pm                      │ Continuous │ Float64 │
│ wf_direct                  │ Continuous │ Float64 │
│ wf_cs                      │ Continuous │ Float64 │
│ wf_meeting                 │ Continuous │ Float64 │
│ wf_original                │ Continuous │ Float64 │
│ wf_project                 │ Continuous │ Float64 │
│ wf_re                      │ Continuous │ Float64 │
│ wf_edu                     │ Continuous │ Float64 │
│ wf_table                   │ Continuous │ Float64 │
│ wf_conference              │ Continuous │ Float64 │
│ cf_;                       │ Continuous │ Float64 │
│ cf_(                       │ Continuous │ Float64 │
│ cf_[                       │ Continuous │ Float64 │
│ cf_!                       │ Continuous │ Float64 │
│ cf_$                       │ Continuous │ Float64 │
│ cf_#                       │ Continuous │ Float64 │
│ capital_run_length_average │ Continuous │ Float64 │
│ capital_run_length_longest │ Continuous │ Float64 │
│ capital_run_length_total   │ Continuous │ Float64 │
└────────────────────────────┴────────────┴─────────┘

Partition en TRAIN/TEST¶

In [36]:
# effectif
n = DFR.nrow(X) 
println(n)
4601
In [37]:
# identifiants -> échantillonner 601 parmi 4601, sans remise
import StatsBase
idTrain = StatsBase.sample(1:n,601,replace=false)
println(length(idTrain))
601
In [38]:
# et les individus en test
idTest = setdiff(1:n,idTrain)
println(length(idTest))
4000
In [39]:
# structures y et X pour train/test
# par indexation avec les indices
yTrain, yTest = y[idTrain], y[idTest]
XTrain, XTest = X[idTrain,:], X[idTest,:]

# afficher les dimensions pour vérifications
println("Dim. de y = $(DFR.size(yTrain)) et $(DFR.size(yTest))")
println("Dim. de X = $(DFR.size(XTrain)) et $(DFR.size(XTest))")
Dim. de y = (601,) et (4000,)
Dim. de X = (601, 55) et (4000, 55)

Comparaison des algorithmes (schéma holdout)¶

Chargement des modèles (à partir des packages spécialisés)¶

In [40]:
# chargement des modèles à comparer
DecisionTreeClassifier  = @MLJ.load DecisionTreeClassifier pkg=DecisionTree
RandomForestClassifier  = @MLJ.load RandomForestClassifier pkg=DecisionTree
LogisticClassifier      = @MLJ.load LogisticClassifier pkg=MLJLinearModels
LinearSVC               = @MLJ.load LinearSVC pkg=LIBSVM
EvoTreeClassifier       = @MLJ.load EvoTreeClassifier pkg=EvoTrees
┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\ricco\.julia\packages\MLJModels\9LbNu\src\loading.jl:159
import MLJDecisionTreeInterface ✔
import MLJDecisionTreeInterface ✔
import MLJLinearModels ✔
import MLJLIBSVMInterface ✔
import EvoTrees ✔
┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\ricco\.julia\packages\MLJModels\9LbNu\src\loading.jl:159
┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\ricco\.julia\packages\MLJModels\9LbNu\src\loading.jl:159
┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\ricco\.julia\packages\MLJModels\9LbNu\src\loading.jl:159
┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\ricco\.julia\packages\MLJModels\9LbNu\src\loading.jl:159
EvoTrees.EvoTreeClassifier

Instanciation (paramètres par défaut)¶

In [41]:
# instanciation des modèles et intégration
# dans une structure de type liste de tuples
models = [
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("Logistic Regression", LogisticClassifier()),
    ("Gradient Boosting", EvoTreeClassifier())
]
4-element Vector{Tuple{String, MLJModelInterface.Probabilistic}}:
 ("Decision Tree", DecisionTreeClassifier(max_depth = -1, …))
 ("Random Forest", RandomForestClassifier(max_depth = -1, …))
 ("Logistic Regression", LogisticClassifier(lambda = 2.220446049250313e-16, …))
 ("Gradient Boosting", EvoTreeClassifier(loss = mlogloss, …))

Inspection des paramètres¶

In [42]:
# paramètres de l'arbre par exemple
# instancié avec les paramètres par défaut
println("*** Parametre (par défaut) de << Arbre de décision ***")
for (param,value) in pairs(MLJ.params(DecisionTreeClassifier()))
    println("$param = $value")
end
*** Parametre (par défaut) de << Arbre de décision ***
max_depth = -1
min_samples_leaf = 1
min_samples_split = 2
min_purity_increase = 0.0
n_subfeatures = 0
post_prune = false
merge_purity_threshold = 1.0
display_depth = 5
feature_importance = impurity
rng = Random.TaskLocalRNG()
In [43]:
# paramètres pour l'ensemble des algos instaciés
# disponibles dans la liste << models >>
for i in 1:length(models)
    println()
    println("*** Parametre de << $(models[i][1]) >> ***")
    for (param,value) in pairs(MLJ.params(models[i][2]))
        println("$param = $value")
    end
end
*** Parametre de << Decision Tree >> ***
max_depth = -1
min_samples_leaf = 1
min_samples_split = 2
min_purity_increase = 0.0
n_subfeatures = 0
post_prune = false
merge_purity_threshold = 1.0
display_depth = 5
feature_importance = impurity
rng = Random.TaskLocalRNG()

*** Parametre de << Random Forest >> ***
max_depth = -1
min_samples_leaf = 1
min_samples_split = 2
min_purity_increase = 0.0
n_subfeatures = -1
n_trees = 100
sampling_fraction = 0.7
feature_importance = impurity
rng = Random.TaskLocalRNG()

*** Parametre de << Logistic Regression >> ***
lambda = 2.220446049250313e-16
gamma = 0.0
penalty = l2
fit_intercept = true
penalize_intercept = false
scale_penalty_with_samples = true
solver = nothing

*** Parametre de << Gradient Boosting >> ***
loss = mlogloss
metric = mlogloss
nrounds = 100
bagging_size = 1
early_stopping_rounds = 9223372036854775807
L2 = 1.0
lambda = 0.0
gamma = 0.0
eta = 0.1
max_depth = 6
min_weight = 1.0
rowsample = 1.0
colsample = 1.0
nbins = 64
tree_type = binary
seed = 123
device = cpu

Expérimentation¶

In [44]:
# lancer l'expérimentation
# structure de récupération des résultats -> liste
results = []

# pour chaque algo
for (name, model) in models
    # préparation - entraînement
    mach = MLJ.machine(model, XTrain, yTrain)
    MLJ.fit!(mach, verbosity=0)

    # prédiction en test
    ypred = MLJ.predict(mach, XTest) # proba d'appartenance
    ypred = MLJ.mode.(ypred)   # convertir probabilités -> classes

    # accuracy
    acc = MLJ.accuracy(ypred, yTest)

    # rajouter le résultat sous la forme de named-tuple
    # dans la liste
    push!(results, (model=name, accuracy=acc))
end

# transformation de la liste en dataframe
df_results = DFR.DataFrame(results)
4×2 DataFrame
Rowmodelaccuracy
StringFloat64
1Decision Tree0.87275
2Random Forest0.934
3Logistic Regression0.9025
4Gradient Boosting0.92825
In [45]:
# affichage avec tri selon les performances
DFR.sort!(df_results,:accuracy,rev=true)
df_results
4×2 DataFrame
Rowmodelaccuracy
StringFloat64
1Random Forest0.934
2Gradient Boosting0.92825
3Logistic Regression0.9025
4Decision Tree0.87275
In [46]:
# représentation graphique
import Plots
Plots.bar(df_results.model,df_results.accuracy,legend=false)
No description has been provided for this image