# Q.0
# Compute the misclassification (error) rate of a classifier together
# with a two-sided 90% confidence interval (normal approximation).
# Similar in spirit to "caret", which instead reports the accuracy rate.
# Arguments: observed classes and classes predicted by the model.
# Prints the confusion matrix, error count/rate and CI bounds;
# (invisibly) returns c(lower, upper).
taux_erreur <- function(y_obs, y_pred) {
  # confusion matrix: rows = observed, columns = predicted
  cm <- table(y_obs, y_pred)
  print("Matrice de confusion :")
  print(cm)
  # misclassified = everything off the diagonal
  n_wrong <- sum(cm) - sum(diag(cm))
  print(paste("Mal classés :", n_wrong))
  # total sample size and error rate
  n_total <- sum(cm)
  err_rate <- n_wrong / n_total
  print(paste("Err :", err_rate))
  # confidence interval for a proportion, normal approximation
  # see e.g. http://www.chups.jussieu.fr/polys/biostats/poly/POLY.Chp.10.3.html
  # standard error of the estimated rate
  std_err <- sqrt(err_rate * (1 - err_rate) / n_total)
  # 95th percentile of N(0,1) -> two-sided 90% interval
  z90 <- qnorm(0.95)
  lower <- err_rate - z90 * std_err
  upper <- err_rate + z90 * std_err
  print("Bornes intervalle de confiance")
  print(c(lower, upper))
}
# Q.1 + Q.2 - load the data set
# header = TRUE spelled out: `T` is a reassignable alias, not a reserved word
DAll <- read.table(file = "kr-vs-kp.txt", header = TRUE, sep = "\t",
                   encoding = "UTF-8", stringsAsFactors = TRUE)
# structure of the data frame (all predictors are 2-level n/y factors)
str(DAll)
## 'data.frame': 3196 obs. of 35 variables:
## $ bkblk : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ bknwy : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ bkon8 : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ bkona : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ bkspr : Factor w/ 2 levels "n","y": 1 2 2 1 1 1 1 2 1 1 ...
## $ bkxbq : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ bkxcr : Factor w/ 2 levels "n","y": 1 1 2 1 1 1 1 1 1 2 ...
## $ bkxwp : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ blxwp : Factor w/ 2 levels "n","y": 1 1 1 2 1 1 2 1 1 1 ...
## $ bxqsq : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ cntxt : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 2 2 2 ...
## $ dsopp : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ hdchk : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ mulch : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ qxmsq : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ r2ar8 : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 2 2 2 2 ...
## $ reskd : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ reskr : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ rimmx : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ rkxwp : Factor w/ 2 levels "n","y": 1 1 1 2 1 1 2 1 1 1 ...
## $ rxmsq : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ simpl : Factor w/ 2 levels "n","y": 1 1 1 1 2 2 2 1 1 1 ...
## $ skach : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ skewr : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 2 2 2 2 ...
## $ skrxp : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ spcop : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ stlmt : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ thrsk : Factor w/ 2 levels "n","y": 1 1 1 1 1 2 1 1 1 1 ...
## $ wkcti : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 2 1 ...
## $ wkna8 : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ wknck : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ wkovl : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 2 2 2 2 ...
## $ wkpos : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 2 2 2 2 ...
## $ wtoeg : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ classe: Factor w/ 2 levels "nowin","won": 2 2 2 2 2 2 2 2 2 2 ...
# Q.3 - train/test partition of the data
# sample sizes
nAll <- nrow(DAll)
nTrain <- 2196
nTest <- nAll - nTrain
# Q.3a + 3b - row indices for the partition (seeded for reproducibility)
set.seed(1)
index <- sample(seq_len(nAll), nTrain)
print(head(index))
## [1] 1017 679 2177 930 1533 471
# Q.3c + 3e - training sample
DTrain <- DAll[index, ]
print(dim(DTrain))
## [1] 2196 35
# Q.3d + 3e - test sample, via negative indexing
DTest <- DAll[-index, ]
print(dim(DTest))
## [1] 1000 35
# Q.4 - logistic regression on every candidate predictor
mAll <- glm(classe ~ ., family = binomial, data = DTrain)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
print(mAll)
##
## Call: glm(formula = classe ~ ., family = binomial, data = DTrain)
##
## Coefficients:
## (Intercept) bkblky bknwyy bkon8y bkonay bkspry
## 4.80047 3.49034 6.45755 -4.22553 -2.67393 -0.80189
## bkxbqy bkxcry bkxwpy blxwpy bxqsqy cntxty
## 3.27775 -0.59419 0.65682 -0.98790 -33.01926 -1.75915
## dsoppy hdchky mulchy qxmsqy r2ar8y reskdy
## 1.52538 -22.74617 -7.54754 5.50558 -1.32849 1.12830
## reskry rimmxy rkxwpy rxmsqy simply skachy
## -0.92118 70.24379 0.35735 -5.02557 -0.46956 -3.07929
## skewry skrxpy spcopy stlmty thrsky wkctiy
## 0.21506 -1.35029 -23.43190 -11.45899 -2.50196 1.28097
## wkna8y wkncky wkovly wkposy wtoegy
## -13.63994 -9.42168 -1.58089 2.33728 -0.09817
##
## Degrees of Freedom: 2195 Total (i.e. Null); 2161 Residual
## Null Deviance: 3033
## Residual Deviance: 390 AIC: 460
# Q.5 - test-set performance (0.5 probability cut-off)
taux_erreur(DTest$classe,
            ifelse(predict(mAll, DTest, type = "response") > 0.5, "won", "nowin"))
## [1] "Matrice de confusion :"
## y_pred
## y_obs nowin won
## nowin 484 22
## won 11 483
## [1] "Mal classés : 33"
## [1] "Err : 0.033"
## [1] "Bornes intervalle de confiance"
## [1] 0.02370825 0.04229175
# Q.6 + Q.7 -- BACKWARD selection -- BIC criterion (k = log(n))
# note: "fitted probabilities numerically 0 or 1" warnings are expected here
library(MASS)
mBack <- stepAIC(mAll, direction = "backward", k = log(nTrain), trace = FALSE)
print(mBack)
##
## Call: glm(formula = classe ~ bkblk + bknwy + bkon8 + bkona + bkxbq +
## bxqsq + cntxt + dsopp + mulch + qxmsq + r2ar8 + rimmx + rxmsq +
## wkcti + wkna8 + wknck + wkpos, family = binomial, data = DTrain)
##
## Coefficients:
## (Intercept) bkblky bknwyy bkon8y bkonay bkxbqy
## 2.586 2.555 6.445 -3.493 -2.218 3.377
## bxqsqy cntxty dsoppy mulchy qxmsqy r2ar8y
## -31.279 -1.404 1.519 -7.351 5.797 -2.061
## rimmxy rxmsqy wkctiy wkna8y wkncky wkposy
## 65.868 -4.156 1.243 -10.790 -8.413 2.225
##
## Degrees of Freedom: 2195 Total (i.e. Null); 2178 Residual
## Null Deviance: 3033
## Residual Deviance: 438.2 AIC: 474.2
# Q.8 - number of selected variables (coefficients minus the intercept)
print(length(mBack$coefficients) - 1)
## [1] 17
# Q.9 -- variables kept in the final model
# coefficient names are "<variable>y" (factor level "y" appended);
# strip the last character — substr is vectorized, no sapply needed,
# and the result is unnamed so no names(...) <- NULL step either
varBack <- local({
  nm <- names(mBack$coefficients)[-1]  # drop the intercept
  substr(nm, 1, nchar(nm) - 1)
})
print(varBack)
## [1] "bkblk" "bknwy" "bkon8" "bkona" "bkxbq" "bxqsq" "cntxt" "dsopp" "mulch"
## [10] "qxmsq" "r2ar8" "rimmx" "rxmsq" "wkcti" "wkna8" "wknck" "wkpos"
# Q.10 - order in which variables were removed
print(mBack$anova)
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## classe ~ bkblk + bknwy + bkon8 + bkona + bkspr + bkxbq + bkxcr +
## bkxwp + blxwp + bxqsq + cntxt + dsopp + hdchk + mulch + qxmsq +
## r2ar8 + reskd + reskr + rimmx + rkxwp + rxmsq + simpl + skach +
## skewr + skrxp + spcop + stlmt + thrsk + wkcti + wkna8 + wknck +
## wkovl + wkpos + wtoeg
##
## Final Model:
## classe ~ bkblk + bknwy + bkon8 + bkona + bkxbq + bxqsq + cntxt +
## dsopp + mulch + qxmsq + r2ar8 + rimmx + rxmsq + wkcti + wkna8 +
## wknck + wkpos
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 2161 389.9969 659.3007
## 2 - stlmt 1 0.001472105 2162 389.9984 651.6077
## 3 - wtoeg 1 0.051040339 2163 390.0494 643.9644
## 4 - rkxwp 1 0.247524104 2164 390.2969 636.5175
## 5 - skewr 1 0.255775632 2165 390.5527 629.0789
## 6 - reskd 1 0.511966992 2166 391.0647 621.8965
## 7 - bkxwp 1 0.618743194 2167 391.6834 614.8208
## 8 - bkxcr 1 1.447424048 2168 393.1309 608.5739
## 9 - spcop 1 2.327741994 2169 395.4586 603.2072
## 10 - simpl 1 2.408947382 2170 397.8675 597.9218
## 11 - skrxp 1 3.173146804 2171 401.0407 593.4005
## 12 - reskr 1 3.679780623 2172 404.7205 589.3859
## 13 - skach 1 3.873527648 2173 408.5940 585.5650
## 14 - blxwp 1 4.498759060 2174 413.0928 582.3694
## 15 - hdchk 1 4.370751433 2175 417.4635 579.0458
## 16 - wkovl 1 6.909197415 2176 424.3727 578.2606
## 17 - bkspr 1 6.861564661 2177 431.2343 577.4277
## 18 - thrsk 1 6.963804325 2178 438.1981 576.6971
# Q.11 - BIC evolution as variables are removed
# custom x axis with rotated step labels
# https://stackoverflow.com/questions/5182238/replace-x-axis-with-own-values
plot(mBack$anova[, 'AIC'], type = "b", axes = FALSE,
     ylab = "BIC", xlab = "", main = "Evolution BIC")
# seq_along instead of 1:length(...) — safe even for an empty path
axis(1, at = seq_along(mBack$anova[, 'Step']),
     labels = mBack$anova[, 'Step'], las = 2, cex.axis = 0.75)
axis(2)
# Q.12 + Q.13 -- performances
# no significant performance difference vs the model with all variables,
# but this one relies on half as many predictors:
# more attractive because easier to interpret and to deploy
taux_erreur(DTest$classe,
            ifelse(predict(mBack, DTest, type = "response") > 0.5, "won", "nowin"))
## [1] "Matrice de confusion :"
## y_pred
## y_obs nowin won
## nowin 484 22
## won 17 477
## [1] "Mal classés : 39"
## [1] "Err : 0.039"
## [1] "Bornes intervalle de confiance"
## [1] 0.02893019 0.04906981
# Q.14 - FORWARD selection - BIC criterion
# starting point: the trivial (intercept-only) model
mNull <- glm(classe ~ 1, data = DTrain, family = binomial)
# forward BIC search, from the null model up to the full model's scope
mFwd <- stepAIC(mNull, scope = list(lower = formula(mNull), upper = formula(mAll)),
                direction = "forward", k = log(nTrain), trace = FALSE)
print(mFwd)
##
## Call: glm(formula = classe ~ rimmx + bxqsq + wknck + wkna8 + bkxbq +
## wkpos + r2ar8 + rxmsq + qxmsq + bkspr + mulch + bknwy + bkon8 +
## bkblk + bkona + cntxt + thrsk, family = binomial, data = DTrain)
##
## Coefficients:
## (Intercept) rimmxy bxqsqy wkncky wkna8y bkxbqy
## 3.765 65.371 -31.904 -8.413 -10.983 3.234
## wkposy r2ar8y rxmsqy qxmsqy bkspry mulchy
## 2.064 -2.322 -4.383 5.513 -1.082 -7.517
## bknwyy bkon8y bkblky bkonay cntxty thrsky
## 6.417 -3.769 2.564 -2.059 -1.351 -1.796
##
## Degrees of Freedom: 2195 Total (i.e. Null); 2178 Residual
## Null Deviance: 3033
## Residual Deviance: 438.5 AIC: 474.5
# selected variables: strip the trailing factor level "y" from the
# coefficient names — substr is vectorized, no sapply needed, and the
# result is unnamed so no names(...) <- NULL step either
varFwd <- local({
  nm <- names(mFwd$coefficients)[-1]  # drop the intercept
  substr(nm, 1, nchar(nm) - 1)
})
print(varFwd)
## [1] "rimmx" "bxqsq" "wknck" "wkna8" "bkxbq" "wkpos" "r2ar8" "rxmsq" "qxmsq"
## [10] "bkspr" "mulch" "bknwy" "bkon8" "bkblk" "bkona" "cntxt" "thrsk"
# number of selected variables
print(length(varFwd))
## [1] 17
# order in which variables were introduced
print(mFwd$anova)
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## classe ~ 1
##
## Final Model:
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxbq + wkpos + r2ar8 +
## rxmsq + qxmsq + bkspr + mulch + bknwy + bkon8 + bkblk + bkona +
## cntxt + thrsk
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 2195 3033.4939 3041.1883
## 2 + rimmx 1 589.312725 2194 2444.1812 2459.5700
## 3 + bxqsq 1 697.643484 2193 1746.5377 1769.6209
## 4 + wknck 1 642.573874 2192 1103.9638 1134.7414
## 5 + wkna8 1 275.941712 2191 828.0221 866.4941
## 6 + bkxbq 1 85.953088 2190 742.0690 788.2354
## 7 + wkpos 1 50.015314 2189 692.0537 745.9145
## 8 + r2ar8 1 48.867858 2188 643.1859 704.7410
## 9 + rxmsq 1 33.680305 2187 609.5056 678.7551
## 10 + qxmsq 1 39.074398 2186 570.4312 647.3751
## 11 + bkspr 1 22.645183 2185 547.7860 632.4243
## 12 + mulch 1 13.952396 2184 533.8336 626.1663
## 13 + bknwy 1 35.627716 2183 498.2059 598.2330
## 14 + bkon8 1 9.341919 2182 488.8639 596.5854
## 15 + bkblk 1 15.003463 2181 473.8605 589.2764
## 16 + bkona 1 19.068895 2180 454.7916 577.9019
## 17 + cntxt 1 8.297971 2179 446.4936 577.2983
## 18 + thrsk 1 8.015254 2178 438.4784 576.9774
# test-set performance
# same performance as the backward approach
taux_erreur(DTest$classe,
            ifelse(predict(mFwd, DTest, type = "response") > 0.5, "won", "nowin"))
## [1] "Matrice de confusion :"
## y_pred
## y_obs nowin won
## nowin 478 28
## won 11 483
## [1] "Mal classés : 39"
## [1] "Err : 0.039"
## [1] "Bornes intervalle de confiance"
## [1] 0.02893019 0.04906981
# Q.15 -- study of the selected variables
# variables common to forward and backward
print(intersect(varFwd, varBack))
## [1] "rimmx" "bxqsq" "wknck" "wkna8" "bkxbq" "wkpos" "r2ar8" "rxmsq" "qxmsq"
## [10] "mulch" "bknwy" "bkon8" "bkblk" "bkona" "cntxt"
# selected by forward but not by backward
print(setdiff(varFwd, varBack))
## [1] "bkspr" "thrsk"
# selected by backward but not by forward
print(setdiff(varBack, varFwd))
## [1] "dsopp" "wkcti"
# Conclusion: each selection method yields one candidate solution. Criteria
# for comparing them: number of variables, generalization performance and,
# when the numeric criteria tie, interpretability of the results.
# (This paragraph was a bare prose line without a leading '#', which made
# the script fail to parse when sourced.)
# Q.16 - the FSelector package
library(FSelector)
# Q.17 - variable importance according to the symmetrical uncertainty (SU) criterion
importance <- symmetrical.uncertainty(classe ~ ., data = DTrain)
print(importance)
## attr_importance
## bkblk 3.424633e-09
## bknwy 2.337413e-04
## bkon8 1.163370e-02
## bkona 1.394065e-04
## bkspr 4.673889e-05
## bkxbq 1.050509e-02
## bkxcr 1.504816e-02
## bkxwp 4.071475e-02
## blxwp 3.305556e-03
## bxqsq 1.186935e-01
## cntxt 2.556719e-03
## dsopp 7.157315e-05
## hdchk 8.778565e-03
## mulch 3.158791e-02
## qxmsq 4.519091e-05
## r2ar8 2.006349e-02
## reskd 2.839338e-03
## reskr 1.457706e-04
## rimmx 2.293502e-01
## rkxwp 1.138004e-02
## rxmsq 9.694662e-03
## simpl 2.380870e-04
## skach 3.016341e-03
## skewr 2.336260e-03
## skrxp 1.699078e-02
## spcop 1.004453e-03
## stlmt 2.511110e-02
## thrsk 3.511458e-03
## wkcti 8.861802e-03
## wkna8 4.956890e-02
## wknck 1.094321e-01
## wkovl 2.446534e-03
## wkpos 1.554507e-02
## wtoeg 4.677794e-05
# Q.18 - list the variables by decreasing importance
# idx: row order sorting the SU scores from largest to smallest
idx <- order(importance$attr_importance, decreasing = TRUE)
sortedImp <- data.frame(var = rownames(importance)[idx],
                        importance = importance$attr_importance[idx])
print(sortedImp)
## var importance
## 1 rimmx 2.293502e-01
## 2 bxqsq 1.186935e-01
## 3 wknck 1.094321e-01
## 4 wkna8 4.956890e-02
## 5 bkxwp 4.071475e-02
## 6 mulch 3.158791e-02
## 7 stlmt 2.511110e-02
## 8 r2ar8 2.006349e-02
## 9 skrxp 1.699078e-02
## 10 wkpos 1.554507e-02
## 11 bkxcr 1.504816e-02
## 12 bkon8 1.163370e-02
## 13 rkxwp 1.138004e-02
## 14 bkxbq 1.050509e-02
## 15 rxmsq 9.694662e-03
## 16 wkcti 8.861802e-03
## 17 hdchk 8.778565e-03
## 18 thrsk 3.511458e-03
## 19 blxwp 3.305556e-03
## 20 skach 3.016341e-03
## 21 reskd 2.839338e-03
## 22 cntxt 2.556719e-03
## 23 wkovl 2.446534e-03
## 24 skewr 2.336260e-03
## 25 spcop 1.004453e-03
## 26 simpl 2.380870e-04
## 27 bknwy 2.337413e-04
## 28 reskr 1.457706e-04
## 29 bkona 1.394065e-04
## 30 dsopp 7.157315e-05
## 31 wtoeg 4.677794e-05
## 32 bkspr 4.673889e-05
## 33 qxmsq 4.519091e-05
## 34 bkblk 3.424633e-09
# Q.19 - graphical display of the ranked importances
barplot(rev(sortedImp$importance), horiz = TRUE,
        names.arg = rev(sortedImp$var), cex.names = 0.9, las = 2)
# Q.20 - keep the 17 best-ranked variables
selImportance <- FSelector::cutoff.k(importance, k = 17)
print(selImportance)
## [1] "rimmx" "bxqsq" "wknck" "wkna8" "bkxwp" "mulch" "stlmt" "r2ar8" "skrxp"
## [10] "wkpos" "bkxcr" "bkon8" "rkxwp" "bkxbq" "rxmsq" "wkcti" "hdchk"
# overlap with the backward selection
print(intersect(selImportance, varBack))
## [1] "rimmx" "bxqsq" "wknck" "wkna8" "mulch" "r2ar8" "wkpos" "bkon8" "bkxbq"
## [10] "rxmsq" "wkcti"
# overlap with the forward selection
print(intersect(selImportance, varFwd))
## [1] "rimmx" "bxqsq" "wknck" "wkna8" "mulch" "r2ar8" "wkpos" "bkon8" "bkxbq"
## [10] "rxmsq"
# Q.21 - turn the selection into a model formula
frmSelImp <- as.simple.formula(selImportance, "classe")
print(frmSelImp)
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk
## <environment: 0x00000000152e5348>
# Q.22 - fit the logistic regression on these variables
mSelImp <- glm(frmSelImp, family = binomial, data = DTrain)
print(mSelImp)
##
## Call: glm(formula = frmSelImp, family = binomial, data = DTrain)
##
## Coefficients:
## (Intercept) rimmxy bxqsqy wkncky wkna8y bkxwpy
## 2.2548 50.8144 -24.4936 -6.6446 -5.3288 -0.1035
## mulchy stlmty r2ar8y skrxpy wkposy bkxcry
## -2.6899 -16.4674 -1.7242 -1.4142 2.3054 -0.8092
## bkon8y rkxwpy bkxbqy rxmsqy wkctiy hdchky
## -1.6816 -0.4965 2.3322 -2.5753 1.1486 -19.4787
##
## Degrees of Freedom: 2195 Total (i.e. Null); 2178 Residual
## Null Deviance: 3033
## Residual Deviance: 552 AIC: 588
# test-set performance -- with as many variables as backward/forward,
# this model is slightly worse, though not significantly so
# (the confidence intervals overlap); still 8 extra misclassified cases,
# 14 when compared with the initial all-variables model
taux_erreur(DTest$classe,
            ifelse(predict(mSelImp, DTest, type = "response") > 0.5, "won", "nowin"))
## [1] "Matrice de confusion :"
## y_pred
## y_obs nowin won
## nowin 470 36
## won 11 483
## [1] "Mal classés : 47"
## [1] "Err : 0.047"
## [1] "Bornes intervalle de confiance"
## [1] 0.03599164 0.05800836
# Q.23 - CFS selection, which accounts for redundancy between predictors
selCFS <- FSelector::cfs(classe ~ ., data = DTrain)
# number of selected variables
print(length(selCFS))
## [1] 3
# their names
print(selCFS)
## [1] "bxqsq" "rimmx" "wknck"
# Q.24 - corresponding formula
frmCFS <- as.simple.formula(selCFS, "classe")
print(frmCFS)
## classe ~ bxqsq + rimmx + wknck
## <environment: 0x0000000013af7c50>
# fit the regression on these variables
mCFS <- glm(frmCFS, family = binomial, data = DTrain)
print(mCFS)
##
## Call: glm(formula = frmCFS, family = binomial, data = DTrain)
##
## Coefficients:
## (Intercept) bxqsqy rimmxy wkncky
## 1.478 -20.505 41.096 -3.775
##
## Degrees of Freedom: 2195 Total (i.e. Null); 2192 Residual
## Null Deviance: 3033
## Residual Deviance: 1104 AIC: 1112
# test-set performance
# clearly, the CFS approach is not at all suited to our problem --
# at least it does not pick variables that work well for logistic regression
# ==> far too few variables selected
taux_erreur(DTest$classe,
            ifelse(predict(mCFS, DTest, type = "response") > 0.5, "won", "nowin"))
## [1] "Matrice de confusion :"
## y_pred
## y_obs nowin won
## nowin 413 93
## won 10 484
## [1] "Mal classés : 103"
## [1] "Err : 0.103"
## [1] "Bornes intervalle de confiance"
## [1] 0.08718963 0.11881037
# Q.25 - using caret
library(caret)
## Le chargement a nécessité le package : lattice
## Le chargement a nécessité le package : ggplot2
# Q.26
# p: number of candidate predictors (all columns except the target)
p <- ncol(DTrain) - 1
# accuracy results: one row per number of variables,
# column 1 = mean cross-validated accuracy, column 2 = its standard deviation
succes <- matrix(0, nrow = p, ncol = 2)
# Q.27 - evaluate each number of top-ranked variables with a loop
# seq_len(p) instead of 1:p — safe even if p were 0
for (j in seq_len(p)) {
  # j best-ranked variables according to the SU criterion
  best <- cutoff.k(importance, k = j)
  # formula using these j variables
  formule <- as.simple.formula(best, "classe")
  print(formule)
  # fit with caret: logistic regression, 5-fold cross-validation
  lrm <- train(formule, data = DTrain, method = "glm",
               trControl = trainControl(method = "cv", number = 5))
  # store cross-validated accuracy and its standard deviation
  succes[j, 1] <- lrm$results$Accuracy
  succes[j, 2] <- lrm$results$AccuracySD
}
## classe ~ rimmx
## <environment: 0x000000006c62c3e0>
## classe ~ rimmx + bxqsq
## <environment: 0x00000000700dec30>
## classe ~ rimmx + bxqsq + wknck
## <environment: 0x000000007004ce48>
## classe ~ rimmx + bxqsq + wknck + wkna8
## <environment: 0x0000000070017478>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp
## <environment: 0x000000006ff63140>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch
## <environment: 0x000000006f9dda78>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt
## <environment: 0x000000006f9cfea8>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8
## <environment: 0x000000006f9601a8>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp
## <environment: 0x000000006f94ade8>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos
## <environment: 0x000000006f8dfc88>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr
## <environment: 0x000000006f89ecb8>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8
## <environment: 0x000000006f83ce08>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp
## <environment: 0x000000006f7e4d90>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq
## <environment: 0x000000006f7b6e40>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq
## <environment: 0x000000006f762cf0>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti
## <environment: 0x000000006f701ce0>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk
## <environment: 0x000000006f6dc0b8>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk + thrsk
## <environment: 0x000000006f6c71b8>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk + thrsk + blxwp
## <environment: 0x000000006f69c7c8>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk + thrsk + blxwp + skach
## <environment: 0x000000006f64a0e0>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk + thrsk + blxwp + skach + reskd
## <environment: 0x000000006f629c38>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt
## <environment: 0x000000006f589a60>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl
## <environment: 0x000000006f50f710>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl +
## skewr
## <environment: 0x000000006f4e10e0>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl +
## skewr + spcop
## <environment: 0x000000006f4ab7f0>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl +
## skewr + spcop + simpl
## <environment: 0x000000006f493e80>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl +
## skewr + spcop + simpl + bknwy
## <environment: 0x000000006f492d70>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl +
## skewr + spcop + simpl + bknwy + reskr
## <environment: 0x000000007023f6e8>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl +
## skewr + spcop + simpl + bknwy + reskr + bkona
## <environment: 0x00000000700b05c8>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl +
## skewr + spcop + simpl + bknwy + reskr + bkona + dsopp
## <environment: 0x00000000700d3708>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl +
## skewr + spcop + simpl + bknwy + reskr + bkona + dsopp + wtoeg
## <environment: 0x0000000070158818>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl +
## skewr + spcop + simpl + bknwy + reskr + bkona + dsopp + wtoeg +
## bkspr
## <environment: 0x00000000701982b0>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl +
## skewr + spcop + simpl + bknwy + reskr + bkona + dsopp + wtoeg +
## bkspr + qxmsq
## <environment: 0x00000000701d86c0>
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl +
## skewr + spcop + simpl + bknwy + reskr + bkona + dsopp + wtoeg +
## bkspr + qxmsq + bkblk
## <environment: 0x000000006f4997e0>
# Q.28 - plot of the << confidence intervals >>
# in the sense of mean +/- 1 standard deviation
# https://stackoverflow.com/questions/14069629/how-can-i-plot-data-with-confidence-intervals
# hoist the x positions and the two bands instead of recomputing 1:p and
# succes[,1] +/- succes[,2] in every call; seq_len is safe even when p == 0
xs <- seq_len(p)
lo <- succes[, 1] - succes[, 2]  # lower band
hi <- succes[, 1] + succes[, 2]  # upper band
plot(xs, succes[, 1], type = "n", xlab = "# de variables", ylab = "Accuracy",
     main = "Ranking + Wrapper (cross-validation)", ylim = c(0.65, 1.0))
# shaded ribbon between the two bands
polygon(c(rev(xs), xs), c(rev(lo), hi), col = 'lavender', border = NA)
lines(xs, succes[, 1], type = "l", lwd = 2, col = "lightgoldenrod4")
lines(xs, succes[, 1], type = "b", lwd = 2, col = "lightgoldenrod4", cex = 0.8)
lines(xs, lo, type = "l", col = "lightpink3", lty = 2)
lines(xs, hi, type = "l", col = "lightpink3", lty = 2)
# Q.29 - picking the "optimal" combination
# index of the maximum cross-validated accuracy
K.optim <- which.max(succes[, 1])
print(K.optim)
## [1] 34
# keep the K best-ranked variables -- in this exercise, all of them in fact
bestK <- cutoff.k(importance, k = K.optim)
formuleK <- as.simple.formula(bestK, "classe")
print(formuleK)
## classe ~ rimmx + bxqsq + wknck + wkna8 + bkxwp + mulch + stlmt +
## r2ar8 + skrxp + wkpos + bkxcr + bkon8 + rkxwp + bkxbq + rxmsq +
## wkcti + hdchk + thrsk + blxwp + skach + reskd + cntxt + wkovl +
## skewr + spcop + simpl + bknwy + reskr + bkona + dsopp + wtoeg +
## bkspr + qxmsq + bkblk
## <environment: 0x000000006b630e40>
# fit the model...
mBestK <- glm(formuleK, family = binomial, data = DTrain)
# ... and evaluate it on the test set
taux_erreur(DTest$classe,
            ifelse(predict(mBestK, DTest, type = "response") > 0.5, "won", "nowin"))
## [1] "Matrice de confusion :"
## y_pred
## y_obs nowin won
## nowin 484 22
## won 11 483
## [1] "Mal classés : 33"
## [1] "Err : 0.033"
## [1] "Bornes intervalle de confiance"
## [1] 0.02370825 0.04229175