Function for performance evaluation

# Q.0
# function computing the error rate
# and its 90% confidence interval
# similar to "caret", which, however, works with the recognition rate (accuracy)
# takes as input the observed classes and the classes predicted by the model
taux_erreur <- function(y_obs,y_pred){
  mc <- table(y_obs,y_pred)
  #display the confusion matrix
  print("Matrice de confusion :")
  print(mc)
  #misclassified observations
  wrong <- sum(mc) - sum(diag(mc))
  print(paste("Mal classés :",wrong))
  #error rate
  err <- wrong/sum(mc)
  print(paste("Err :",err))
  #confidence interval
  #e.g. http://www.chups.jussieu.fr/polys/biostats/poly/POLY.Chp.10.3.html
  #sample size
  n <- sum(mc)
  #standard error
  et <- sqrt(err*(1-err)/n)
  #95% quantile of the standard normal (gives a two-sided 90% interval)
  z <- qnorm(0.95)
  #lower bound
  bb <- err - z * et
  #upper bound
  bh <- err + z * et
  #display the interval bounds
  print("Bornes intervalle de confiance")
  print(c(bb,bh))
}
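A quick sanity check of the function on made-up labels (hypothetical toy example, not part of the exercise): 2 errors out of 6 observations should yield an error rate of 1/3.

# hypothetical toy labels - 2 of the 6 predictions are wrong
toy_obs  <- factor(c("a","a","a","b","b","b"))
toy_pred <- factor(c("a","a","b","b","b","a"))
# expected: err = 2/6, i.e. about 0.333
taux_erreur(toy_obs,toy_pred)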

Importing and preparing the data

# Q.1 + Q.2 - loading the data
DAll <- read.table(file="kr-vs-kp.txt",header=T,sep="\t",encoding="UTF-8",stringsAsFactors=TRUE)

# dataset characteristics
str(DAll)
## 'data.frame':    3196 obs. of  35 variables:
##  $ bkblk : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ bknwy : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ bkon8 : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ bkona : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ bkspr : Factor w/ 2 levels "n","y": 1 2 2 1 1 1 1 2 1 1 ...
##  $ bkxbq : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ bkxcr : Factor w/ 2 levels "n","y": 1 1 2 1 1 1 1 1 1 2 ...
##  $ bkxwp : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ blxwp : Factor w/ 2 levels "n","y": 1 1 1 2 1 1 2 1 1 1 ...
##  $ bxqsq : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ cntxt : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 2 2 2 ...
##  $ dsopp : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ hdchk : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mulch : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ qxmsq : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ r2ar8 : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ reskd : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ reskr : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ rimmx : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ rkxwp : Factor w/ 2 levels "n","y": 1 1 1 2 1 1 2 1 1 1 ...
##  $ rxmsq : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ simpl : Factor w/ 2 levels "n","y": 1 1 1 1 2 2 2 1 1 1 ...
##  $ skach : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ skewr : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ skrxp : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ spcop : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ stlmt : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ thrsk : Factor w/ 2 levels "n","y": 1 1 1 1 1 2 1 1 1 1 ...
##  $ wkcti : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 2 1 ...
##  $ wkna8 : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ wknck : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ wkovl : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ wkpos : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ wtoeg : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ classe: Factor w/ 2 levels "nowin","won": 2 2 2 2 2 2 2 2 2 2 ...
# Q.3 - train-test split of the data

# sample sizes
nAll <- nrow(DAll)
nTrain <- 2196
nTest <- nAll-nTrain

# Q.3a + 3b - indices for the split
set.seed(1)
index <- sample(x=seq_len(nAll),size=nTrain)
print(head(index))
## [1] 1017  679 2177  930 1533  471
# Q.3c + 3e - training sample
DTrain <- DAll[index,]
print(dim(DTrain))
## [1] 2196   35
# Q.3d + 3e - test sample
DTest <- DAll[-index,] #using negative indices...
print(dim(DTest))
## [1] 1000   35
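A quick sanity check (not asked in the exercise): the class distribution should be roughly preserved on both sides of the random split.

# class proportions in each sample - they should be close to each other
print(prop.table(table(DTrain$classe)))
print(prop.table(table(DTest$classe)))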

Modeling - All variables

# Q.4 - logistic regression on all the variables
mAll <- glm(classe ~ ., data = DTrain, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
print(mAll)
## 
## Call:  glm(formula = classe ~ ., family = binomial, data = DTrain)
## 
## Coefficients:
## (Intercept)       bkblky       bknwyy       bkon8y       bkonay       bkspry  
##     4.80047      3.49034      6.45755     -4.22553     -2.67393     -0.80189  
##      bkxbqy       bkxcry       bkxwpy       blxwpy       bxqsqy       cntxty  
##     3.27775     -0.59419      0.65682     -0.98790    -33.01926     -1.75915  
##      dsoppy       hdchky       mulchy       qxmsqy       r2ar8y       reskdy  
##     1.52538    -22.74617     -7.54754      5.50558     -1.32849      1.12830  
##      reskry       rimmxy       rkxwpy       rxmsqy       simply       skachy  
##    -0.92118     70.24379      0.35735     -5.02557     -0.46956     -3.07929  
##      skewry       skrxpy       spcopy       stlmty       thrsky       wkctiy  
##     0.21506     -1.35029    -23.43190    -11.45899     -2.50196      1.28097  
##      wkna8y       wkncky       wkovly       wkposy       wtoegy  
##   -13.63994     -9.42168     -1.58089      2.33728     -0.09817  
## 
## Degrees of Freedom: 2195 Total (i.e. Null);  2161 Residual
## Null Deviance:       3033 
## Residual Deviance: 390   AIC: 460
# Q.5 - performance
taux_erreur(DTest$classe,ifelse(predict(mAll,DTest,type="response")>0.5,"won","nowin"))
## [1] "Matrice de confusion :"
##        y_pred
## y_obs   nowin won
##   nowin   484  22
##   won      11 483
## [1] "Mal classés : 33"
## [1] "Err : 0.033"
## [1] "Bornes intervalle de confiance"
## [1] 0.02370825 0.04229175
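As noted in Q.0, "caret" reports the recognition rate (accuracy) rather than the error rate. A sketch of the equivalent cross-check with caret::confusionMatrix(), using the same predictions as above (caret is only loaded in Q.25):

# cross-check with caret: the reported accuracy should equal 1 - err above
library(caret)
pred <- factor(ifelse(predict(mAll,DTest,type="response")>0.5,"won","nowin"),
               levels=levels(DTest$classe))
print(confusionMatrix(data=pred,reference=DTest$classe))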

Selection with stepAIC

# Q.6 + Q.7 -- BACKWARD selection -- BIC criterion
# note: the chunk uses warning = FALSE because of the "fitted probabilities 0 or 1" message
library(MASS)
mBack <- stepAIC(mAll,direction="backward",k=log(nTrain),trace=FALSE)
print(mBack)
## 
## Call:  glm(formula = classe ~ bkblk + bknwy + bkon8 + bkona + bkxbq + 
##     bxqsq + cntxt + dsopp + mulch + qxmsq + r2ar8 + rimmx + rxmsq + 
##     wkcti + wkna8 + wknck + wkpos, family = binomial, data = DTrain)
## 
## Coefficients:
## (Intercept)       bkblky       bknwyy       bkon8y       bkonay       bkxbqy  
##       2.586        2.555        6.445       -3.493       -2.218        3.377  
##      bxqsqy       cntxty       dsoppy       mulchy       qxmsqy       r2ar8y  
##     -31.279       -1.404        1.519       -7.351        5.797       -2.061  
##      rimmxy       rxmsqy       wkctiy       wkna8y       wkncky       wkposy  
##      65.868       -4.156        1.243      -10.790       -8.413        2.225  
## 
## Degrees of Freedom: 2195 Total (i.e. Null);  2178 Residual
## Null Deviance:       3033 
## Residual Deviance: 438.2     AIC: 474.2
# Q.8 - number of selected variables
print(length(mBack$coefficients)-1)
## [1] 17
# Q.9 -- list of the variables kept in the final model
# coefficient names carry the level suffix "y" (e.g. "bkblky"), which we strip off
varBack <- sapply(tail(names(mBack$coefficients),length(mBack$coefficients)-1),function(x){substr(x,1,nchar(x)-1)})
names(varBack) <- NULL
print(varBack)
##  [1] "bkblk" "bknwy" "bkon8" "bkona" "bkxbq" "bxqsq" "cntxt" "dsopp" "mulch"
## [10] "qxmsq" "r2ar8" "rimmx" "rxmsq" "wkcti" "wkna8" "wknck" "wkpos"
# Q.10 - order in which the variables were removed
print(mBack$anova)
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## classe ~ bkblk + bknwy + bkon8 + bkona + bkspr + bkxbq + bkxcr + 
##     bkxwp + blxwp + bxqsq + cntxt + dsopp + hdchk + mulch + qxmsq + 
##     r2ar8 + reskd + reskr + rimmx + rkxwp + rxmsq + simpl + skach + 
##     skewr + skrxp + spcop + stlmt + thrsk + wkcti + wkna8 + wknck + 
##     wkovl + wkpos + wtoeg
## 
## Final Model:
## classe ~ bkblk + bknwy + bkon8 + bkona + bkxbq + bxqsq + cntxt + 
##     dsopp + mulch + qxmsq + r2ar8 + rimmx + rxmsq + wkcti + wkna8 + 
##     wknck + wkpos
## 
## 
##       Step Df    Deviance Resid. Df Resid. Dev      AIC
## 1                              2161   389.9969 659.3007
## 2  - stlmt  1 0.001472105      2162   389.9984 651.6077
## 3  - wtoeg  1 0.051040339      2163   390.0494 643.9644
## 4  - rkxwp  1 0.247524104      2164   390.2969 636.5175
## 5  - skewr  1 0.255775632      2165   390.5527 629.0789
## 6  - reskd  1 0.511966992      2166   391.0647 621.8965
## 7  - bkxwp  1 0.618743194      2167   391.6834 614.8208
## 8  - bkxcr  1 1.447424048      2168   393.1309 608.5739
## 9  - spcop  1 2.327741994      2169   395.4586 603.2072
## 10 - simpl  1 2.408947382      2170   397.8675 597.9218
## 11 - skrxp  1 3.173146804      2171   401.0407 593.4005
## 12 - reskr  1 3.679780623      2172   404.7205 589.3859
## 13 - skach  1 3.873527648      2173   408.5940 585.5650
## 14 - blxwp  1 4.498759060      2174   413.0928 582.3694
## 15 - hdchk  1 4.370751433      2175   417.4635 579.0458
## 16 - wkovl  1 6.909197415      2176   424.3727 578.2606
## 17 - bkspr  1 6.861564661      2177   431.2343 577.4277
## 18 - thrsk  1 6.963804325      2178   438.1981 576.6971
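Since stepAIC() was called with k = log(nTrain), the "AIC" column above actually contains the BIC. A quick check (sketch): both values computed below should match the last row of the table (about 576.70 here).

# with k = log(n), the "AIC" column is deviance + log(n) * number of parameters, i.e. the BIC
print(BIC(mBack))
print(deviance(mBack) + log(nTrain)*length(coef(mBack)))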
# Q.11 - evolution of the BIC as variables are removed
# (the column is labeled 'AIC' but contains the BIC since k = log(n))
# plot with rotated labels
# https://stackoverflow.com/questions/5182238/replace-x-axis-with-own-values
plot(mBack$anova[,'AIC'],type="b",axes=FALSE,ylab="BIC",xlab="",main="BIC evolution")
axis(1,at=1:length(mBack$anova[,'Step']),labels=mBack$anova[,'Step'],las=2,cex.axis=0.75)
axis(2)

# Q.12 + Q.13 -- performance
# no significant difference in performance compared with the model using all the variables,
# but this model relies on half as many variables
# more interesting because easier to interpret and easier to deploy
taux_erreur(DTest$classe,ifelse(predict(mBack,DTest,type="response")>0.5,"won","nowin"))
## [1] "Matrice de confusion :"
##        y_pred
## y_obs   nowin won
##   nowin   484  22
##   won      17 477
## [1] "Mal classés : 39"
## [1] "Err : 0.039"
## [1] "Bornes intervalle de confiance"
## [1] 0.02893019 0.04906981
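The overlap of the confidence intervals can be complemented by a formal comparison; a sketch with prop.test() on the two error counts (33/1000 vs. 39/1000). Indicative only: both rates are measured on the same test set, so the independence assumption of the test does not strictly hold.

# indicative test of 33/1000 vs 39/1000 misclassified (same test set, so only a rough check)
print(prop.test(x=c(33,39),n=c(nTest,nTest)))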
# Q.14 - FORWARD selection - BIC criterion

# starting point: the trivial (intercept-only) model
mNull <- glm(classe ~ 1, data = DTrain, family = binomial)

# forward selection with BIC
mFwd <- stepAIC(mNull,scope=list(lower=formula(mNull),upper=formula(mAll)),direction="forward",k=log(nTrain),trace=FALSE)
print(mFwd)
## 
## Call:  glm(formula = classe ~ rimmx + bxqsq + wknck + wkna8 + bkxbq + 
##     wkpos + r2ar8 + rxmsq + qxmsq + bkspr + mulch + bknwy + bkon8 + 
##     bkblk + bkona + cntxt + thrsk, family = binomial, data = DTrain)
## 
## Coefficients:
## (Intercept)       rimmxy       bxqsqy       wkncky       wkna8y       bkxbqy  
##       3.765       65.371      -31.904       -8.413      -10.983        3.234  
##      wkposy       r2ar8y       rxmsqy       qxmsqy       bkspry       mulchy  
##       2.064       -2.322       -4.383        5.513       -1.082       -7.517  
##      bknwyy       bkon8y       bkblky       bkonay       cntxty       thrsky  
##       6.417       -3.769        2.564       -2.059       -1.351       -1.796  
## 
## Degrees of Freedom: 2195 Total (i.e. Null);  2178 Residual
## Null Deviance:       3033 
## Residual Deviance: 438.5     AIC: 474.5
# selected variables
varFwd <- sapply(tail(names(mFwd$coefficients),length(mFwd$coefficients)-1),function(x){substr(x,1,nchar(x)-1)})
names(varFwd) <- NULL
print(varFwd)
##  [1] "rimmx" "bxqsq" "wknck" "wkna8" "bkxbq" "wkpos" "r2ar8" "rxmsq" "qxmsq"
## [10] "bkspr" "mulch" "bknwy" "bkon8" "bkblk" "bkona" "cntxt" "thrsk"
# number of variables
print(length(varFwd))
## [1] 17
# order in which the variables were introduced
print(mFwd$anova)
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## classe ~ 1
## 
## Final Model:
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxbq + wkpos + r2ar8 + 
##     rxmsq + qxmsq + bkspr + mulch + bknwy + bkon8 + bkblk + bkona + 
##     cntxt + thrsk
## 
## 
##       Step Df   Deviance Resid. Df Resid. Dev       AIC
## 1                             2195  3033.4939 3041.1883
## 2  + rimmx  1 589.312725      2194  2444.1812 2459.5700
## 3  + bxqsq  1 697.643484      2193  1746.5377 1769.6209
## 4  + wknck  1 642.573874      2192  1103.9638 1134.7414
## 5  + wkna8  1 275.941712      2191   828.0221  866.4941
## 6  + bkxbq  1  85.953088      2190   742.0690  788.2354
## 7  + wkpos  1  50.015314      2189   692.0537  745.9145
## 8  + r2ar8  1  48.867858      2188   643.1859  704.7410
## 9  + rxmsq  1  33.680305      2187   609.5056  678.7551
## 10 + qxmsq  1  39.074398      2186   570.4312  647.3751
## 11 + bkspr  1  22.645183      2185   547.7860  632.4243
## 12 + mulch  1  13.952396      2184   533.8336  626.1663
## 13 + bknwy  1  35.627716      2183   498.2059  598.2330
## 14 + bkon8  1   9.341919      2182   488.8639  596.5854
## 15 + bkblk  1  15.003463      2181   473.8605  589.2764
## 16 + bkona  1  19.068895      2180   454.7916  577.9019
## 17 + cntxt  1   8.297971      2179   446.4936  577.2983
## 18 + thrsk  1   8.015254      2178   438.4784  576.9774
# performance on the test set
# same error rate as with the backward approach (the confusion matrix differs slightly)
taux_erreur(DTest$classe,ifelse(predict(mFwd,DTest,type="response")>0.5,"won","nowin"))
## [1] "Matrice de confusion :"
##        y_pred
## y_obs   nowin won
##   nowin   478  28
##   won      11 483
## [1] "Mal classés : 39"
## [1] "Err : 0.039"
## [1] "Bornes intervalle de confiance"
## [1] 0.02893019 0.04906981
# Q.15 -- study of the selected variables

# variables common to forward and backward
print(intersect(varFwd,varBack))
##  [1] "rimmx" "bxqsq" "wknck" "wkna8" "bkxbq" "wkpos" "r2ar8" "rxmsq" "qxmsq"
## [10] "mulch" "bknwy" "bkon8" "bkblk" "bkona" "cntxt"
# crossing the variable sets: present in forward but not in backward
print(setdiff(varFwd,varBack))
## [1] "bkspr" "thrsk"
# crossing the variable sets: present in backward but not in forward
print(setdiff(varBack,varFwd))
## [1] "dsopp" "wkcti"

Conclusion: each selection method yields one candidate solution. Several criteria can be used to compare them: number of variables and generalization performance; when the numerical criteria are tied, the interpretability of the results decides.
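A small recap table (figures taken from the outputs above) makes the comparison explicit:

# recap of the comparison criteria, values collected from the runs above
recap <- data.frame(model=c("all variables","backward BIC","forward BIC"),
                    nb_var=c(34,17,17),
                    test_err=c(0.033,0.039,0.039))
print(recap)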

Filter methods for variable selection

# Q.16 - FSelector package
library(FSelector)
# Q.17 - variable importance according to the SU (symmetrical uncertainty) criterion
importance <- FSelector::symmetrical.uncertainty(classe ~ ., data = DTrain)
print(importance)
##       attr_importance
## bkblk    3.424633e-09
## bknwy    2.337413e-04
## bkon8    1.163370e-02
## bkona    1.394065e-04
## bkspr    4.673889e-05
## bkxbq    1.050509e-02
## bkxcr    1.504816e-02
## bkxwp    4.071475e-02
## blxwp    3.305556e-03
## bxqsq    1.186935e-01
## cntxt    2.556719e-03
## dsopp    7.157315e-05
## hdchk    8.778565e-03
## mulch    3.158791e-02
## qxmsq    4.519091e-05
## r2ar8    2.006349e-02
## reskd    2.839338e-03
## reskr    1.457706e-04
## rimmx    2.293502e-01
## rkxwp    1.138004e-02
## rxmsq    9.694662e-03
## simpl    2.380870e-04
## skach    3.016341e-03
## skewr    2.336260e-03
## skrxp    1.699078e-02
## spcop    1.004453e-03
## stlmt    2.511110e-02
## thrsk    3.511458e-03
## wkcti    8.861802e-03
## wkna8    4.956890e-02
## wknck    1.094321e-01
## wkovl    2.446534e-03
## wkpos    1.554507e-02
## wtoeg    4.677794e-05
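To see what symmetrical.uncertainty() computes, here is a hand-rolled version for a single variable (a sketch; for two factors SU = 2·IG / (H(X) + H(Y)), and the ratio is invariant to the log base, so the result should be close to the value reported above for rimmx, about 0.229):

# manual computation of symmetrical uncertainty for one variable (sketch)
entropy <- function(p){ p <- p[p > 0]; -sum(p*log(p)) }
su_manual <- function(x,y){
  hx  <- entropy(prop.table(table(x)))
  hy  <- entropy(prop.table(table(y)))
  hxy <- entropy(prop.table(table(x,y)))
  # SU = 2 * IG / (H(X) + H(Y)), with IG = H(X) + H(Y) - H(X,Y)
  2*(hx + hy - hxy)/(hx + hy)
}
print(su_manual(DTrain$rimmx,DTrain$classe))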
# Q.18 - display in decreasing order of importance
idx <- order(importance$attr_importance,decreasing = TRUE)
sortedImp <- data.frame(var=rownames(importance)[idx],importance=importance$attr_importance[idx])
print(sortedImp)
##      var   importance
## 1  rimmx 2.293502e-01
## 2  bxqsq 1.186935e-01
## 3  wknck 1.094321e-01
## 4  wkna8 4.956890e-02
## 5  bkxwp 4.071475e-02
## 6  mulch 3.158791e-02
## 7  stlmt 2.511110e-02
## 8  r2ar8 2.006349e-02
## 9  skrxp 1.699078e-02
## 10 wkpos 1.554507e-02
## 11 bkxcr 1.504816e-02
## 12 bkon8 1.163370e-02
## 13 rkxwp 1.138004e-02
## 14 bkxbq 1.050509e-02
## 15 rxmsq 9.694662e-03
## 16 wkcti 8.861802e-03
## 17 hdchk 8.778565e-03
## 18 thrsk 3.511458e-03
## 19 blxwp 3.305556e-03
## 20 skach 3.016341e-03
## 21 reskd 2.839338e-03
## 22 cntxt 2.556719e-03
## 23 wkovl 2.446534e-03
## 24 skewr 2.336260e-03
## 25 spcop 1.004453e-03
## 26 simpl 2.380870e-04
## 27 bknwy 2.337413e-04
## 28 reskr 1.457706e-04
## 29 bkona 1.394065e-04
## 30 dsopp 7.157315e-05
## 31 wtoeg 4.677794e-05
## 32 bkspr 4.673889e-05
## 33 qxmsq 4.519091e-05
## 34 bkblk 3.424633e-09
# Q.19 - graphical representation
barplot(rev(sortedImp$importance),horiz=TRUE,names.arg = rev(sortedImp$var),cex.names=0.9,las=2)

# Q.20 - retrieve the 17 best variables
selImportance <- FSelector::cutoff.k(importance,k=17)
print(selImportance)
##  [1] "rimmx" "bxqsq" "wknck" "wkna8" "bkxwp" "mulch" "stlmt" "r2ar8" "skrxp"
## [10] "wkpos" "bkxcr" "bkon8" "rkxwp" "bkxbq" "rxmsq" "wkcti" "hdchk"
# in common with the backward selection
print(intersect(selImportance,varBack))
##  [1] "rimmx" "bxqsq" "wknck" "wkna8" "mulch" "r2ar8" "wkpos" "bkon8" "bkxbq"
## [10] "rxmsq" "wkcti"
# in common with the forward selection
print(intersect(selImportance,varFwd))
##  [1] "rimmx" "bxqsq" "wknck" "wkna8" "mulch" "r2ar8" "wkpos" "bkon8" "bkxbq"
## [10] "rxmsq"
# Q.21 - convert into a formula
frmSelImp <- as.simple.formula(selImportance,"classe")
print(frmSelImp)
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk
## <environment: 0x00000000152e5348>
# Q.22 - run the regression with these variables
mSelImp <- glm(frmSelImp, data = DTrain, family = binomial)
print(mSelImp)
## 
## Call:  glm(formula = frmSelImp, family = binomial, data = DTrain)
## 
## Coefficients:
## (Intercept)       rimmxy       bxqsqy       wkncky       wkna8y       bkxwpy  
##      2.2548      50.8144     -24.4936      -6.6446      -5.3288      -0.1035  
##      mulchy       stlmty       r2ar8y       skrxpy       wkposy       bkxcry  
##     -2.6899     -16.4674      -1.7242      -1.4142       2.3054      -0.8092  
##      bkon8y       rkxwpy       bkxbqy       rxmsqy       wkctiy       hdchky  
##     -1.6816      -0.4965       2.3322      -2.5753       1.1486     -19.4787  
## 
## Degrees of Freedom: 2195 Total (i.e. Null);  2178 Residual
## Null Deviance:       3033 
## Residual Deviance: 552   AIC: 588
# performance on the test set -- with the same number of variables as backward and forward,
# the model is slightly worse
# not significantly so (statistically, the confidence intervals overlap)
# but there are still 8 additional misclassified observations
# 14 when compared with the initial model using all the variables
taux_erreur(DTest$classe,ifelse(predict(mSelImp,DTest,type="response")>0.5,"won","nowin"))
## [1] "Matrice de confusion :"
##        y_pred
## y_obs   nowin won
##   nowin   470  36
##   won      11 483
## [1] "Mal classés : 47"
## [1] "Err : 0.047"
## [1] "Bornes intervalle de confiance"
## [1] 0.03599164 0.05800836
# Q.23 - CFS selection, which takes redundancy between variables into account
selCFS <- FSelector::cfs(classe ~ ., data = DTrain)

# number of selected variables
print(length(selCFS))
## [1] 3
# list
print(selCFS)
## [1] "bxqsq" "rimmx" "wknck"
# Q.24 - corresponding formula
frmCFS <- as.simple.formula(selCFS,"classe")
print(frmCFS)
## classe ~ bxqsq + rimmx + wknck
## <environment: 0x0000000013af7c50>
# run the regression with these variables
mCFS <- glm(frmCFS, data = DTrain, family = binomial)
print(mCFS)
## 
## Call:  glm(formula = frmCFS, family = binomial, data = DTrain)
## 
## Coefficients:
## (Intercept)       bxqsqy       rimmxy       wkncky  
##       1.478      -20.505       41.096       -3.775  
## 
## Degrees of Freedom: 2195 Total (i.e. Null);  2192 Residual
## Null Deviance:       3033 
## Residual Deviance: 1104  AIC: 1112
# performance on the test set
# clearly, the CFS approach is not at all suited to our problem
# in any case, it does not select the right variables for logistic regression
# ==> too few variables selected
taux_erreur(DTest$classe,ifelse(predict(mCFS,DTest,type="response")>0.5,"won","nowin"))
## [1] "Matrice de confusion :"
##        y_pred
## y_obs   nowin won
##   nowin   413  93
##   won      10 484
## [1] "Mal classés : 103"
## [1] "Err : 0.103"
## [1] "Bornes intervalle de confiance"
## [1] 0.08718963 0.11881037
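Side note: the three variables retained by CFS are exactly the top 3 of the SU ranking computed earlier.

# CFS kept exactly the 3 highest-ranked variables according to SU
print(head(sortedImp,3))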

Ranking + Wrapper

# Q.25 - using caret
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Q.26
# p = number of candidate variables
p <- ncol(DTrain)-1

# matrix for the accuracy and its standard deviation
succes <- matrix(0,nrow=p,ncol=2)
# Q.27 - testing the different configurations in a loop
for (j in 1:p){
  #the j best variables
  best <- cutoff.k(importance,k=j)
  #formula with these j variables
  formule <- as.simple.formula(best,"classe")
  print(formule)
  #fitting with caret (5-fold cross-validation)
  lrm <- train(formule,data=DTrain,method="glm",trControl=trainControl(method="cv",number=5))
  #evaluation
  succes[j,1] <- lrm$results$Accuracy
  succes[j,2] <- lrm$results$AccuracySD
}
## classe ~ rimmx
## <environment: 0x000000006c62c3e0>
## classe ~ rimmx + bxqsq
## <environment: 0x00000000700dec30>
## classe ~ rimmx + bxqsq + wknck
## <environment: 0x000000007004ce48>
## classe ~ rimmx + bxqsq + wknck + wkna8
## <environment: 0x0000000070017478>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp
## <environment: 0x000000006ff63140>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch
## <environment: 0x000000006f9dda78>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt
## <environment: 0x000000006f9cfea8>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8
## <environment: 0x000000006f9601a8>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp
## <environment: 0x000000006f94ade8>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos
## <environment: 0x000000006f8dfc88>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr
## <environment: 0x000000006f89ecb8>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8
## <environment: 0x000000006f83ce08>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp
## <environment: 0x000000006f7e4d90>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq
## <environment: 0x000000006f7b6e40>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq
## <environment: 0x000000006f762cf0>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti
## <environment: 0x000000006f701ce0>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk
## <environment: 0x000000006f6dc0b8>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk + thrsk
## <environment: 0x000000006f6c71b8>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk + thrsk + blxwp
## <environment: 0x000000006f69c7c8>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk + thrsk + blxwp + skach
## <environment: 0x000000006f64a0e0>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk + thrsk + blxwp + skach + reskd
## <environment: 0x000000006f629c38>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt
## <environment: 0x000000006f589a60>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl
## <environment: 0x000000006f50f710>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl + 
##     skewr
## <environment: 0x000000006f4e10e0>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl + 
##     skewr + spcop
## <environment: 0x000000006f4ab7f0>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl + 
##     skewr + spcop + simpl
## <environment: 0x000000006f493e80>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl + 
##     skewr + spcop + simpl + bknwy
## <environment: 0x000000006f492d70>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl + 
##     skewr + spcop + simpl + bknwy + reskr
## <environment: 0x000000007023f6e8>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl + 
##     skewr + spcop + simpl + bknwy + reskr + bkona
## <environment: 0x00000000700b05c8>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl + 
##     skewr + spcop + simpl + bknwy + reskr + bkona + dsopp
## <environment: 0x00000000700d3708>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl + 
##     skewr + spcop + simpl + bknwy + reskr + bkona + dsopp + wtoeg
## <environment: 0x0000000070158818>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl + 
##     skewr + spcop + simpl + bknwy + reskr + bkona + dsopp + wtoeg + 
##     bkspr
## <environment: 0x00000000701982b0>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl + 
##     skewr + spcop + simpl + bknwy + reskr + bkona + dsopp + wtoeg + 
##     bkspr + qxmsq
## <environment: 0x00000000701d86c0>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl + 
##     skewr + spcop + simpl + bknwy + reskr + bkona + dsopp + wtoeg + 
##     bkspr + qxmsq + bkblk
## <environment: 0x000000006f4997e0>
# Q.28 - plot of the "confidence intervals"
# in the sense of +/- 1 standard deviation around the mean
# https://stackoverflow.com/questions/14069629/how-can-i-plot-data-with-confidence-intervals
plot(1:p,succes[,1],type="n",xlab="# of variables",ylab="Accuracy",main="Ranking + Wrapper (cross-validation)",ylim=c(0.65,1.0))
polygon(c(rev(1:p),1:p),c(rev(succes[,1]-succes[,2]),succes[,1]+succes[,2]),col = 'lavender', border = NA)
lines(1:p,succes[,1],type="l",lwd=2,col="lightgoldenrod4")
lines(1:p,succes[,1],type="b",lwd=2,col="lightgoldenrod4",cex=0.8)
lines(1:p,succes[,1]-succes[,2],type="l",col="lightpink3",lty=2)
lines(1:p,succes[,1]+succes[,2],type="l",col="lightpink3",lty=2)
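Rather than taking the raw maximum (see Q.29 below), a common heuristic, not required in the exercise, is the "one standard error" rule: keep the smallest number of variables whose cross-validated accuracy stays within one standard deviation of the best configuration. A sketch:

# one-standard-error rule (sketch): smallest k within 1 SD of the best accuracy
seuil <- max(succes[,1]) - succes[which.max(succes[,1]),2]
print(min(which(succes[,1] >= seuil)))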

# Q.29 - selecting the "optimal" combination
# index of the maximum
K.optim <- which.max(succes[,1])
print(K.optim)
## [1] 34
# pick the K best variables - in fact all of them in our exercise
bestK <- cutoff.k(importance,k=K.optim)
formuleK <- as.simple.formula(bestK,"classe")
print(formuleK)
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt + 
##     r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq + 
##     wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl + 
##     skewr + spcop + simpl + bknwy + reskr + bkona + dsopp + wtoeg + 
##     bkspr + qxmsq + bkblk
## <environment: 0x000000006b630e40>
# fit the model...
mBestK <- glm(formuleK, data = DTrain, family = binomial)

# ... and evaluate on the test set
taux_erreur(DTest$classe,ifelse(predict(mBestK,DTest,type="response")>0.5,"won","nowin"))
## [1] "Matrice de confusion :"
##        y_pred
## y_obs   nowin won
##   nowin   484  22
##   won      11 483
## [1] "Mal classés : 33"
## [1] "Err : 0.033"
## [1] "Bornes intervalle de confiance"
## [1] 0.02370825 0.04229175