Importing and preparing the data

# Q.0 - import the training sample
# located in the first sheet of the workbook
library(xlsx)
DTrain <- read.xlsx("waveform.xlsx",sheetIndex=1,stringsAsFactors=TRUE,encoding="UTF-8")

# data description
str(DTrain)
## 'data.frame':    500 obs. of  8 variables:
##  $ V05   : num  1.54 0.95 2.48 4.35 4.04 3.89 2.7 2.53 2.68 1.44 ...
##  $ V07   : num  4.48 2.71 3.16 6.64 4.34 3.67 3.66 3.51 3.67 2.59 ...
##  $ V10   : num  1.51 2.09 2.06 3.57 2.61 2.62 3 2.12 2.91 2.11 ...
##  $ V11   : num  1.09 1.5 2.24 1.47 0.62 2.73 3.7 2.23 1.45 1.58 ...
##  $ V12   : num  1.32 1.09 2.87 0.95 1.81 1.61 3.19 2.81 1.18 -0.72 ...
##  $ V15   : num  2.25 2.85 3.42 1.13 -0.14 2.21 4.26 2.92 1.24 3.66 ...
##  $ V18   : num  0.53 1.87 1.58 -1.43 -0.77 0.2 1.95 3.11 0.51 0.36 ...
##  $ classe: Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
# Q.1 - descriptive statistics
# no, the explanatory variables are not centered (mean <> 0)
# 173 observations of class "A", 157 of "B" and 170 of "C"
print(summary(DTrain))
##       V05              V07              V10              V11        
##  Min.   :-2.600   Min.   :-2.670   Min.   :-1.790   Min.   :-0.600  
##  1st Qu.:-0.095   1st Qu.: 1.150   1st Qu.: 1.877   1st Qu.: 2.185  
##  Median : 1.090   Median : 2.400   Median : 2.915   Median : 3.195  
##  Mean   : 1.255   Mean   : 2.598   Mean   : 2.938   Mean   : 3.323  
##  3rd Qu.: 2.520   3rd Qu.: 4.135   3rd Qu.: 3.935   3rd Qu.: 4.532  
##  Max.   : 5.920   Max.   : 7.690   Max.   : 6.770   Max.   : 8.050  
##       V12              V15              V18         classe 
##  Min.   :-0.780   Min.   :-2.140   Min.   :-2.740   A:173  
##  1st Qu.: 1.935   1st Qu.: 1.278   1st Qu.: 0.150   B:157  
##  Median : 2.985   Median : 2.735   Median : 1.105   C:170  
##  Mean   : 3.028   Mean   : 2.764   Mean   : 1.112          
##  3rd Qu.: 4.072   3rd Qu.: 4.263   3rd Qu.: 2.013          
##  Max.   : 7.300   Max.   : 7.960   Max.   : 5.260
# Q.2 - pairwise scatter plots
# we note that the classes separate fairly well
# depending on the combination of variables
pairs(DTrain[-ncol(DTrain)],col=c('black','red','green')[DTrain$classe],pch=19,cex=0.5)

# Q.3 + Q.4 -- center and scale the explanatory variables
ZTrain <- scale(DTrain[-ncol(DTrain)],center=TRUE,scale=TRUE)

# summary of the standardized data
print(summary(ZTrain))
##       V05                V07               V10                V11         
##  Min.   :-2.28698   Min.   :-2.7023   Min.   :-3.14061   Min.   :-2.4250  
##  1st Qu.:-0.80080   1st Qu.:-0.7430   1st Qu.:-0.70449   1st Qu.:-0.7033  
##  Median :-0.09776   Median :-0.1018   Median :-0.01533   Median :-0.0789  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.75064   3rd Qu.: 0.7881   3rd Qu.: 0.66220   3rd Qu.: 0.7480  
##  Max.   : 2.76780   Max.   : 2.6116   Max.   : 2.54534   Max.   : 2.9225  
##       V12                V15                V18           
##  Min.   :-2.53965   Min.   :-2.47443   Min.   :-2.844481  
##  1st Qu.:-0.72896   1st Qu.:-0.75022   1st Qu.:-0.710315  
##  Median :-0.02869   Median :-0.01488   Median :-0.005081  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.69659   3rd Qu.: 0.75577   3rd Qu.: 0.665077  
##  Max.   : 2.84907   Max.   : 2.62124   Max.   : 3.063244
# Q.5 -- additional attributes of the object
# returned by scale()
print(attributes(ZTrain))
## $dim
## [1] 500   7
## 
## $dimnames
## $dimnames[[1]]
## NULL
## 
## $dimnames[[2]]
## [1] "V05" "V07" "V10" "V11" "V12" "V15" "V18"
## 
## 
## $`scaled:center`
##     V05     V07     V10     V11     V12     V15     V18 
## 1.25478 2.59846 2.93808 3.32262 3.02802 2.76450 1.11188 
## 
## $`scaled:scale`
##      V05      V07      V10      V11      V12      V15      V18 
## 1.685532 1.949606 1.505464 1.617583 1.499429 1.982077 1.354159
# Q.6
# access the scaled:center attribute
# holding the means used for centering
print(attr(ZTrain,'scaled:center'))
##     V05     V07     V10     V11     V12     V15     V18 
## 1.25478 2.59846 2.93808 3.32262 3.02802 2.76450 1.11188
# and the standard deviations
print(attr(ZTrain,'scaled:scale'))
##      V05      V07      V10      V11      V12      V15      V18 
## 1.685532 1.949606 1.505464 1.617583 1.499429 1.982077 1.354159

Logistic regression with “nnet”

# Q.7 -- nnet library
library(nnet)
# Q.8 - fit the model (the underlying algorithm is actually a perceptron
# trained by gradient descent)
# we must work on the standardized data because the optimization heuristic
# can be hampered by the scale differences between the variables
# we obtain the estimated coefficients, the deviance and the AIC criterion
# note that there are only 2 equations for 3 classes: one class ("A" here)
# acts as the reference level, as in any multinomial logit model
mNet <- nnet::multinom(classe ~ ., data = cbind(ZTrain,DTrain['classe']))
## # weights:  27 (16 variable)
## initial  value 549.306144 
## iter  10 value 206.319054
## iter  20 value 179.322581
## final  value 178.812650 
## converged
print(mNet)
## Call:
## nnet::multinom(formula = classe ~ ., data = cbind(ZTrain, DTrain["classe"]))
## 
## Coefficients:
##   (Intercept)        V05        V07      V10      V11      V12         V15
## B  -0.6740575 -0.2869636  0.7071511 1.344863 1.791235 1.052232 -1.73265071
## C  -0.5643332 -2.1829750 -1.2259345 0.814202 1.397061 1.367325 -0.07096102
##          V18
## B -1.0414078
## C -0.2278076
## 
## Residual Deviance: 357.6253 
## AIC: 389.6253
# Q.9 - attributes of the object
print(attributes(mNet))
## $names
##  [1] "n"             "nunits"        "nconn"         "conn"         
##  [5] "nsunits"       "decay"         "entropy"       "softmax"      
##  [9] "censored"      "value"         "wts"           "convergence"  
## [13] "fitted.values" "residuals"     "lev"           "call"         
## [17] "terms"         "weights"       "deviance"      "rank"         
## [21] "lab"           "coefnames"     "vcoefnames"    "xlevels"      
## [25] "edf"           "AIC"          
## 
## $class
## [1] "multinom" "nnet"
# Q.10 - summary
sNet <- summary(mNet)
print(sNet)
## Call:
## nnet::multinom(formula = classe ~ ., data = cbind(ZTrain, DTrain["classe"]))
## 
## Coefficients:
##   (Intercept)        V05        V07      V10      V11      V12         V15
## B  -0.6740575 -0.2869636  0.7071511 1.344863 1.791235 1.052232 -1.73265071
## C  -0.5643332 -2.1829750 -1.2259345 0.814202 1.397061 1.367325 -0.07096102
##          V18
## B -1.0414078
## C -0.2278076
## 
## Std. Errors:
##   (Intercept)       V05       V07       V10       V11       V12      V15
## B   0.3789441 0.3154739 0.3281692 0.3012618 0.3080071 0.3228184 0.359432
## C   0.3713243 0.3358100 0.3547562 0.2813144 0.2992985 0.2988285 0.314818
##         V18
## B 0.2765070
## C 0.2326525
## 
## Residual Deviance: 357.6253 
## AIC: 389.6253
# Q.11 - attributes of the summary object
print(attributes(sNet))
## $names
##  [1] "n"               "nunits"          "nconn"           "conn"           
##  [5] "nsunits"         "decay"           "entropy"         "softmax"        
##  [9] "censored"        "value"           "wts"             "convergence"    
## [13] "fitted.values"   "residuals"       "lev"             "call"           
## [17] "terms"           "weights"         "deviance"        "rank"           
## [21] "lab"             "coefnames"       "vcoefnames"      "xlevels"        
## [25] "edf"             "AIC"             "is.binomial"     "digits"         
## [29] "coefficients"    "standard.errors"
## 
## $class
## [1] "summary.multinom"
# Q.12 - coefficients
print(sNet$coefficients)
##   (Intercept)        V05        V07      V10      V11      V12         V15
## B  -0.6740575 -0.2869636  0.7071511 1.344863 1.791235 1.052232 -1.73265071
## C  -0.5643332 -2.1829750 -1.2259345 0.814202 1.397061 1.367325 -0.07096102
##          V18
## B -1.0414078
## C -0.2278076
# object class
print(class(sNet$coefficients))
## [1] "matrix" "array"
# Q.13 - estimated probabilities
# we have the values for all 3 classes!
head(sNet$fitted.values)
##           A           B            C
## 1 0.9816813 0.016439970 1.878687e-03
## 2 0.9803203 0.004538032 1.514166e-02
## 3 0.9613791 0.023623419 1.499748e-02
## 4 0.3310797 0.668901640 1.870174e-05
## 5 0.6616134 0.338247655 1.389403e-04
## 6 0.8140623 0.184657167 1.280535e-03
# dimensions (nb. of training obs. x nb. of classes)
print(dim(sNet$fitted.values))
## [1] 500   3
# Q.14 - check that each row sums to 1
# it does, fortunately!
print(rowSums(sNet$fitted.values))
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
# Q.15
# let us check for the first individual
# lecture notes -- page 5

# Q.15a + Q.15b - logit for level B + exp()
eB <- exp(sum(c(1,ZTrain[1,]) * sNet$coefficients["B",]))
print(eB)
## [1] 0.01674675
# level C
eC <- exp(sum(c(1,ZTrain[1,]) * sNet$coefficients["C",]))
print(eC)
## [1] 0.001913744
# Q.15c - probability of B
pB <- eB/(1.0 + (eB + eC))
print(pB)
## [1] 0.01643997
# probability of C
pC <- eC/(1.0 + (eB + eC))
print(pC)
## [1] 0.001878687
# Q.15d - probability of A (by complement)
pA <- 1.0 - (pB + pC)
print(pA)
## [1] 0.9816813
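# vectorized version (sketch, illustrative names): the same softmax computation
# for all training rows at once, to be compared with sNet$fitted.values
scores <- cbind(1, ZTrain) %*% t(sNet$coefficients)   # 500 x 2 matrix of logits (B and C vs. A)
eScores <- exp(scores)
den <- 1 + rowSums(eScores)
probsAll <- cbind(A = 1/den, eScores/den)
print(max(abs(probsAll - sNet$fitted.values)))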
# Q.16
# check the overall relevance of the model
# via the likelihood-ratio test

# Q.16a - null model
mNull <- nnet::multinom(classe ~ 1, data = DTrain)
## # weights:  6 (2 variable)
## initial  value 549.306144 
## final  value 548.868278 
## converged
print(mNull)
## Call:
## nnet::multinom(formula = classe ~ 1, data = DTrain)
## 
## Coefficients:
##   (Intercept)
## B -0.09704616
## C -0.01749279
## 
## Residual Deviance: 1097.737 
## AIC: 1101.737
# Q.16b - likelihood-ratio test statistic
LRGlob <- mNull$deviance - mNet$deviance
print(paste('Test statistic =',LRGlob))
## [1] "Test statistic = 740.111254720694"
# Q.16c - degrees of freedom
ddlGlob <- mNet$edf - mNull$edf
print(paste('Degrees of freedom =',ddlGlob))
## [1] "Degrees of freedom = 14"
# Q.16c + Q.16d - p-value
print(paste('p-value =',pchisq(LRGlob,ddlGlob,lower.tail=FALSE)))
## [1] "p-value = 7.01830654766149e-149"
# Q.17
# check the relevance of each variable
# via a likelihood-ratio test for the nullity of its coefficients
for (j in 1:ncol(ZTrain)){
  # regression without the variable being tested
  mTemp <- nnet::multinom(classe ~ ., data = cbind(ZTrain[,-j],DTrain['classe']))
  # likelihood-ratio statistic
  LR <- mTemp$deviance - mNet$deviance
  # degrees of freedom
  ddl <- mNet$edf - mTemp$edf
  # p-value
  pval <- pchisq(LR,ddl,lower.tail=FALSE)
  # display
  print(paste(colnames(ZTrain)[j],": "))
  print(c(LR,ddl,pval))
}
## # weights:  24 (14 variable)
## initial  value 549.306144 
## iter  10 value 214.223489
## iter  20 value 210.713376
## final  value 210.697144 
## converged
## [1] "V05 : "
## [1] 6.376899e+01 2.000000e+00 1.421478e-14
## # weights:  24 (14 variable)
## initial  value 549.306144 
## iter  10 value 196.794337
## iter  20 value 193.740664
## final  value 193.713567 
## converged
## [1] "V07 : "
## [1] 2.980183e+01 2.000000e+00 3.377645e-07
## # weights:  24 (14 variable)
## initial  value 549.306144 
## iter  10 value 211.492946
## iter  20 value 190.459968
## final  value 190.406889 
## converged
## [1] "V10 : "
## [1] 2.318848e+01 2.000000e+00 9.219050e-06
## # weights:  24 (14 variable)
## initial  value 549.306144 
## iter  10 value 213.658486
## iter  20 value 204.523704
## final  value 204.465619 
## converged
## [1] "V11 : "
## [1] 5.130594e+01 2.000000e+00 7.228651e-12
## # weights:  24 (14 variable)
## initial  value 549.306144 
## iter  10 value 202.867700
## iter  20 value 192.335120
## final  value 191.974427 
## converged
## [1] "V12 : "
## [1] 2.632355e+01 2.000000e+00 1.922706e-06
## # weights:  24 (14 variable)
## initial  value 549.306144 
## iter  10 value 201.525247
## iter  20 value 195.922767
## final  value 195.899833 
## converged
## [1] "V15 : "
## [1] 3.417437e+01 2.000000e+00 3.794293e-08
## # weights:  24 (14 variable)
## initial  value 549.306144 
## iter  10 value 189.567924
## iter  20 value 187.063187
## final  value 187.044944 
## converged
## [1] "V18 : "
## [1] 1.646459e+01 2.000000e+00 2.659256e-04
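# alternative (sketch, assuming the car package is installed): car::Anova()
# runs the same type-II likelihood-ratio test for each explanatory variable
library(car)
print(car::Anova(mNet))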

Prediction and evaluation on the test sample

# Q.18
# load the test sample
DTest <- read.xlsx("waveform.xlsx",sheetIndex=2,stringsAsFactors=TRUE,encoding="UTF-8")

# structure
str(DTest)
## 'data.frame':    5000 obs. of  8 variables:
##  $ V05   : num  0.6 2.3 2.42 1.13 2.66 -1.28 0.84 -1.92 0.53 0.47 ...
##  $ V07   : num  0.85 5.52 4.94 2.37 2.69 1.6 3.67 1.91 0.24 0.86 ...
##  $ V10   : num  0.89 2.22 2.07 4.84 3.53 4.76 5.2 2.02 1.75 4.5 ...
##  $ V11   : num  1.08 2.81 0.51 4.65 4.82 5.55 8.16 3.63 3.92 6.83 ...
##  $ V12   : num  4.2 1.61 1.45 4.05 4.79 4.3 3.29 3.91 5.68 6.94 ...
##  $ V15   : num  4.59 1.88 1.41 1.24 1.73 2.37 0.4 4.89 3.81 1.08 ...
##  $ V18   : num  3.32 1.41 0.62 -1.43 0.13 0.69 0.66 -0.66 1.51 -0.41 ...
##  $ classe: Factor w/ 3 levels "A","B","C": 3 2 1 2 2 3 2 3 3 3 ...
# Q.19 -- descriptive summary
# to be compared with the statistics computed
# on the training sample
print(summary(DTest))
##       V05               V07              V10              V11        
##  Min.   :-3.4800   Min.   :-3.320   Min.   :-1.790   Min.   :-1.480  
##  1st Qu.: 0.0375   1st Qu.: 1.110   1st Qu.: 1.880   1st Qu.: 2.040  
##  Median : 1.1200   Median : 2.500   Median : 3.000   Median : 3.170  
##  Mean   : 1.3109   Mean   : 2.662   Mean   : 2.989   Mean   : 3.337  
##  3rd Qu.: 2.5400   3rd Qu.: 4.210   3rd Qu.: 4.080   3rd Qu.: 4.550  
##  Max.   : 6.5000   Max.   : 8.760   Max.   : 7.630   Max.   : 9.060  
##       V12              V15              V18         classe  
##  Min.   :-1.690   Min.   :-2.560   Min.   :-4.080   A:1657  
##  1st Qu.: 1.920   1st Qu.: 1.120   1st Qu.:-0.010   B:1647  
##  Median : 3.000   Median : 2.490   Median : 0.940   C:1696  
##  Mean   : 3.014   Mean   : 2.648   Mean   : 1.001           
##  3rd Qu.: 4.082   3rd Qu.: 4.183   3rd Qu.: 1.960           
##  Max.   : 7.400   Max.   : 8.720   Max.   : 6.200
# Q.20 + Q.21
# center and scale
# /!\ with the parameters (means, standard deviations)
# computed on the training sample,
# because the test-sample individuals represent the deployment population:
# they must be processed individually, not collectively,
# and no parameter should be estimated on this sample
# cf. the center and scale arguments of the scale(.) function
ZTest <- scale(DTest[-ncol(DTest)],center=attr(ZTrain,'scaled:center'),scale=attr(ZTrain,'scaled:scale'))

# descriptive statistics
print(summary(ZTest))
##       V05                V07                V10                V11           
##  Min.   :-2.80907   Min.   :-3.03572   Min.   :-3.14061   Min.   :-2.969009  
##  1st Qu.:-0.72219   1st Qu.:-0.76347   1st Qu.:-0.70283   1st Qu.:-0.792924  
##  Median :-0.07996   Median :-0.05050   Median : 0.04113   Median :-0.094351  
##  Mean   : 0.03329   Mean   : 0.03249   Mean   : 0.03360   Mean   : 0.008644  
##  3rd Qu.: 0.76250   3rd Qu.: 0.82660   3rd Qu.: 0.75852   3rd Qu.: 0.758774  
##  Max.   : 3.11191   Max.   : 3.16040   Max.   : 3.11659   Max.   : 3.546884  
##       V12                 V15                V18          
##  Min.   :-3.146545   Min.   :-2.68632   Min.   :-3.83402  
##  1st Qu.:-0.738962   1st Qu.:-0.82969   1st Qu.:-0.82847  
##  Median :-0.018687   Median :-0.13849   Median :-0.12693  
##  Mean   :-0.009608   Mean   :-0.05894   Mean   :-0.08216  
##  3rd Qu.: 0.703255   3rd Qu.: 0.71541   3rd Qu.: 0.62631  
##  Max.   : 2.915764   Max.   : 3.00468   Max.   : 3.75740
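# equivalent manual computation (sketch, illustrative name ZTestManuel):
# subtract the training means, then divide by the training standard deviations
ZTestManuel <- sweep(sweep(as.matrix(DTest[-ncol(DTest)]), 2, attr(ZTrain,'scaled:center'), '-'),
                     2, attr(ZTrain,'scaled:scale'), '/')
print(max(abs(ZTestManuel - ZTest)))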
# Q.22 -- prediction
predNet <- predict(mNet,newdata=ZTest,type="class")
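# note (sketch, illustrative name probsNet): predict.multinom also accepts
# type = "probs" to return the class-membership probabilities
# instead of the hard class assignments
probsNet <- predict(mNet, newdata = ZTest, type = "probs")
print(head(probsNet))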

# Q.23 -- distribution of the predicted classes
print(table(predNet))
## predNet
##    A    B    C 
## 1655 1721 1624
# Q.24 - build the confusion matrix
mcNet <- table(DTest$classe,predNet)
print(mcNet)
##    predNet
##        A    B    C
##   A 1323  199  135
##   B  143 1379  125
##   C  189  143 1364
# Q.24a - compute the accuracy rate from the confusion matrix
accNet <- sum(diag(mcNet))/sum(mcNet)
print(accNet)
## [1] 0.8132
# Q.24b - compute the error rate from the confusion matrix
errNet <- 1 - accNet
print(errNet)
## [1] 0.1868
# Q.24c - compute the per-class recalls
rappelNet <- diag(mcNet)/rowSums(mcNet)
print(rappelNet)
##         A         B         C 
## 0.7984309 0.8372799 0.8042453
# Q.24d - compute the per-class precisions
prNet <- diag(mcNet)/colSums(mcNet)
print(prNet)
##         A         B         C 
## 0.7993958 0.8012783 0.8399015
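# complement (sketch, illustrative name f1Net): per-class F1-scores,
# the harmonic mean of precision and recall
f1Net <- 2 * prNet * rappelNet / (prNet + rappelNet)
print(f1Net)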
# Q.25
# check with the "caret" library
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# comparison of the results -- ok
print(caret::confusionMatrix(data=predNet,reference=DTest$classe))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C
##          A 1323  143  189
##          B  199 1379  143
##          C  135  125 1364
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8132          
##                  95% CI : (0.8021, 0.8239)
##     No Information Rate : 0.3392          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7198          
##                                           
##  Mcnemar's Test P-Value : 0.0002283       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C
## Sensitivity            0.7984   0.8373   0.8042
## Specificity            0.9007   0.8980   0.9213
## Pos Pred Value         0.7994   0.8013   0.8399
## Neg Pred Value         0.9001   0.9183   0.9017
## Prevalence             0.3314   0.3294   0.3392
## Detection Rate         0.2646   0.2758   0.2728
## Detection Prevalence   0.3310   0.3442   0.3248
## Balanced Accuracy      0.8496   0.8676   0.8628

Processing with the “VGAM” package

# Q.26 - load the library
library(VGAM)
## Loading required package: stats4
## Loading required package: splines
## 
## Attaching package: 'VGAM'
## The following object is masked from 'package:caret':
## 
##     predictors
# Q.27 - modeling
# note the parameter settings carefully (reference level set to "A")
mVg <- VGAM::vglm(classe ~ ., data = cbind(ZTrain,DTrain['classe']), family=multinomial(refLevel = "A"))
print(mVg)
## 
## Call:
## VGAM::vglm(formula = classe ~ ., family = multinomial(refLevel = "A"), 
##     data = cbind(ZTrain, DTrain["classe"]))
## 
## 
## Coefficients:
## (Intercept):1 (Intercept):2         V05:1         V05:2         V07:1 
##   -0.67395734   -0.56426635   -0.28696650   -2.18294744    0.70713336 
##         V07:2         V10:1         V10:2         V11:1         V11:2 
##   -1.22590914    1.34485051    0.81421167    1.79124390    1.39708662 
##         V12:1         V12:2         V15:1         V15:2         V18:1 
##    1.05221109    1.36733030   -1.73263912   -0.07098318   -1.04142264 
##         V18:2 
##   -0.22780723 
## 
## Degrees of Freedom: 1000 Total; 984 Residual
## Residual deviance: 357.6253 
## Log-likelihood: -178.8127 
## 
## This is a multinomial logit model with 3 levels
# Q.28 - detailed display
# we get the significance tests for the coefficients,
# but equation by equation, which is not very easy to exploit
sVg <- summary(mVg)
print(sVg)
## 
## Call:
## VGAM::vglm(formula = classe ~ ., family = multinomial(refLevel = "A"), 
##     data = cbind(ZTrain, DTrain["classe"]))
## 
## Coefficients: 
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1 -0.67396    0.37893  -1.779 0.075312 .  
## (Intercept):2 -0.56427    0.37132  -1.520 0.128604    
## V05:1         -0.28697    0.31547  -0.910 0.363009    
## V05:2         -2.18295    0.33580  -6.501 8.00e-11 ***
## V07:1          0.70713    0.32816   2.155 0.031175 *  
## V07:2         -1.22591    0.35475  -3.456 0.000549 ***
## V10:1          1.34485    0.30126   4.464 8.04e-06 ***
## V10:2          0.81421    0.28131   2.894 0.003800 ** 
## V11:1          1.79124    0.30800   5.816 6.04e-09 ***
## V11:2          1.39709    0.29930   4.668 3.04e-06 ***
## V12:1          1.05221    0.32282   3.259 0.001116 ** 
## V12:2          1.36733    0.29883   4.576 4.75e-06 ***
## V15:1         -1.73264    0.35943  -4.821 1.43e-06 ***
## V15:2         -0.07098    0.31482  -0.225 0.821610    
## V18:1         -1.04142    0.27650  -3.766 0.000166 ***
## V18:2         -0.22781    0.23265  -0.979 0.327491    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: log(mu[,2]/mu[,1]), log(mu[,3]/mu[,1])
## 
## Residual deviance: 357.6253 on 984 degrees of freedom
## 
## Log-likelihood: -178.8127 on 984 degrees of freedom
## 
## Number of Fisher scoring iterations: 7 
## 
## No Hauck-Donner effect found in any of the estimates
## 
## 
## Reference group is level  1  of the response
# Q.29 - prediction on the test sample
# type = "response" is the one we are interested in
predVglm <- predictvglm(object=mVg,newdata=as.data.frame(ZTest),type="response")

# Q.30 - first rows
# we get the class-membership probabilities
print(head(predVglm))
##              A            B            C
## 1 0.7366413241 0.0002518254 0.2631068504
## 2 0.8085895247 0.1891902382 0.0022202371
## 3 0.9690689180 0.0305145662 0.0004165158
## 4 0.0015288560 0.9557480685 0.0427230755
## 5 0.0141765521 0.9480264368 0.0377970111
## 6 0.0005085881 0.0794625956 0.9200288163
# Q.31 - convert to class predictions
predClassVglm <- factor(c("A","B","C")[apply(predVglm,1,which.max)])
print(head(predClassVglm))
## [1] A A A B B C
## Levels: A B C
# Q.32 - distribution of the predictions
print(table(predClassVglm))
## predClassVglm
##    A    B    C 
## 1655 1721 1624
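# sanity check (sketch): the two packages fit the same model, so the predicted
# classes should coincide with those obtained with nnet
print(all(predClassVglm == predNet))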
# Q.33 - comparison with the observed classes
print(caret::confusionMatrix(data=predClassVglm,reference=DTest$classe))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C
##          A 1323  143  189
##          B  199 1379  143
##          C  135  125 1364
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8132          
##                  95% CI : (0.8021, 0.8239)
##     No Information Rate : 0.3392          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7198          
##                                           
##  Mcnemar's Test P-Value : 0.0002283       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C
## Sensitivity            0.7984   0.8373   0.8042
## Specificity            0.9007   0.8980   0.9213
## Pos Pred Value         0.7994   0.8013   0.8399
## Neg Pred Value         0.9001   0.9183   0.9017
## Prevalence             0.3314   0.3294   0.3392
## Detection Rate         0.2646   0.2758   0.2728
## Detection Prevalence   0.3310   0.3442   0.3248
## Balanced Accuracy      0.8496   0.8676   0.8628

Comparison with a decision tree

# Q.34 - decision tree with rpart
library(rpart)
# Q.35
# we can work directly with the original variables:
# fitting the tree on the centered and scaled data
# would give exactly the same result (see the check after the tree display below)
mArbre <- rpart(classe ~ ., data = DTrain)
print(mArbre)
## n= 500 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 500 327 A (0.34600000 0.31400000 0.34000000)  
##    2) V05>=0.38 323 173 A (0.46439628 0.42414861 0.11145511)  
##      4) V10< 2.775 145  30 A (0.79310345 0.11034483 0.09655172)  
##        8) V18>=-0.235 130  20 A (0.84615385 0.05384615 0.10000000) *
##        9) V18< -0.235 15   6 B (0.33333333 0.60000000 0.06666667) *
##      5) V10>=2.775 178  57 B (0.19662921 0.67977528 0.12359551)  
##       10) V15< 2.115 126  21 B (0.13492063 0.83333333 0.03174603) *
##       11) V15>=2.115 52  34 A (0.34615385 0.30769231 0.34615385)  
##         22) V12< 3.37 27   9 A (0.66666667 0.18518519 0.14814815) *
##         23) V12>=3.37 25  11 C (0.00000000 0.44000000 0.56000000)  
##           46) V18< 0.76 14   4 B (0.00000000 0.71428571 0.28571429) *
##           47) V18>=0.76 11   1 C (0.00000000 0.09090909 0.90909091) *
##    3) V05< 0.38 177  43 C (0.12994350 0.11299435 0.75706215)  
##      6) V11< 1.705 12   2 A (0.83333333 0.00000000 0.16666667) *
##      7) V11>=1.705 165  33 C (0.07878788 0.12121212 0.80000000)  
##       14) V15< 2.435 30  15 B (0.00000000 0.50000000 0.50000000)  
##         28) V05>=-0.26 10   1 B (0.00000000 0.90000000 0.10000000) *
##         29) V05< -0.26 20   6 C (0.00000000 0.30000000 0.70000000) *
##       15) V15>=2.435 135  18 C (0.09629630 0.03703704 0.86666667) *
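# check (sketch, illustrative name mArbreZ): refit the tree on the standardized
# data; threshold-based splits are unaffected by an affine rescaling of each
# variable, so the predicted classes should be identical
mArbreZ <- rpart(classe ~ ., data = cbind(as.data.frame(ZTrain), DTrain['classe']))
print(all(predict(mArbreZ, type = "class") == predict(mArbre, type = "class")))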
# Q.37 - graphical display
library(rpart.plot)
rpart.plot(mArbre)

# Q.38 - list of the tree's attributes
print(attributes(mArbre))
## $names
##  [1] "frame"               "where"               "call"               
##  [4] "terms"               "cptable"             "method"             
##  [7] "parms"               "control"             "functions"          
## [10] "numresp"             "splits"              "variable.importance"
## [13] "y"                   "ordered"            
## 
## $xlevels
## named list()
## 
## $ylevels
## [1] "A" "B" "C"
## 
## $class
## [1] "rpart"
# structure of the tree
print(mArbre$frame[c('var','n','yval')])
##       var   n yval
## 1     V05 500    1
## 2     V10 323    1
## 4     V18 145    1
## 8  <leaf> 130    1
## 9  <leaf>  15    2
## 5     V15 178    2
## 10 <leaf> 126    2
## 11    V12  52    1
## 22 <leaf>  27    1
## 23    V18  25    3
## 46 <leaf>  14    2
## 47 <leaf>  11    3
## 3     V11 177    3
## 6  <leaf>  12    1
## 7     V15 165    3
## 14    V05  30    2
## 28 <leaf>  10    2
## 29 <leaf>  20    3
## 15 <leaf> 135    3
# Q.39
# list of the variables actually used in the tree
# V07 does not appear, even though it is significant in the regression
print(setdiff(unique(mArbre$frame$'var'),c('<leaf>')))
## [1] "V05" "V10" "V18" "V15" "V12" "V11"
# Q.40
# details and the competing variables at each node of the tree
# see in particular: << Primary Splits >>
# on the root node, for instance, V07 ranked 3rd
# in the competition to split the node
print(summary(mArbre))
## Call:
## rpart(formula = classe ~ ., data = DTrain)
##   n= 500 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.33944954      0 1.0000000 1.0122324 0.03234630
## 2 0.26299694      1 0.6605505 0.7094801 0.03410190
## 3 0.02446483      2 0.3975535 0.4403670 0.03096517
## 4 0.01834862      5 0.3241590 0.3944954 0.02991914
## 5 0.01223242      6 0.3058104 0.4189602 0.03049867
## 6 0.01000000      9 0.2691131 0.4097859 0.03028727
## 
## Variable importance
## V05 V10 V15 V11 V18 V07 V12 
##  22  17  17  13  12  11   9 
## 
## Node number 1: 500 observations,    complexity param=0.3394495
##   predicted class=A  expected loss=0.654  P(node) =1
##     class counts:   173   157   170
##    probabilities: 0.346 0.314 0.340 
##   left son=2 (323 obs) right son=3 (177 obs)
##   Primary splits:
##       V05 < 0.38   to the right, improve=71.51910, (0 missing)
##       V15 < 1.93   to the left,  improve=69.90867, (0 missing)
##       V07 < 2.58   to the right, improve=61.33074, (0 missing)
##       V11 < 2.92   to the left,  improve=52.97582, (0 missing)
##       V10 < 2.775  to the left,  improve=50.01740, (0 missing)
##   Surrogate splits:
##       V07 < 2.3    to the right, agree=0.734, adj=0.249, (0 split)
##       V12 < 3.5    to the left,  agree=0.722, adj=0.215, (0 split)
##       V15 < 2.96   to the left,  agree=0.700, adj=0.153, (0 split)
##       V11 < 5.985  to the left,  agree=0.670, adj=0.068, (0 split)
##       V18 < 1.875  to the left,  agree=0.664, adj=0.051, (0 split)
## 
## Node number 2: 323 observations,    complexity param=0.2629969
##   predicted class=A  expected loss=0.5356037  P(node) =0.646
##     class counts:   150   137    36
##    probabilities: 0.464 0.424 0.111 
##   left son=4 (145 obs) right son=5 (178 obs)
##   Primary splits:
##       V10 < 2.775  to the left,  improve=54.39788, (0 missing)
##       V15 < 2.205  to the right, improve=47.54085, (0 missing)
##       V11 < 3.45   to the left,  improve=41.59162, (0 missing)
##       V18 < 0.795  to the right, improve=37.19672, (0 missing)
##       V12 < 3.365  to the left,  improve=18.48477, (0 missing)
##   Surrogate splits:
##       V15 < 2.535  to the right, agree=0.743, adj=0.428, (0 split)
##       V11 < 2.885  to the left,  agree=0.721, adj=0.379, (0 split)
##       V18 < 0.905  to the right, agree=0.700, adj=0.331, (0 split)
##       V07 < 1.355  to the left,  agree=0.619, adj=0.152, (0 split)
##       V12 < 2.765  to the left,  agree=0.588, adj=0.083, (0 split)
## 
## Node number 3: 177 observations,    complexity param=0.02446483
##   predicted class=C  expected loss=0.2429379  P(node) =0.354
##     class counts:    23    20   134
##    probabilities: 0.130 0.113 0.757 
##   left son=6 (12 obs) right son=7 (165 obs)
##   Primary splits:
##       V11 < 1.705  to the left,  improve=11.020240, (0 missing)
##       V12 < 2.855  to the left,  improve= 8.911673, (0 missing)
##       V15 < 2.435  to the left,  improve= 8.332924, (0 missing)
##       V07 < 2.83   to the right, improve= 7.413908, (0 missing)
##       V18 < -0.045 to the left,  improve= 6.303253, (0 missing)
##   Surrogate splits:
##       V10 < -0.4   to the left,  agree=0.949, adj=0.250, (0 split)
##       V18 < 4.205  to the right, agree=0.938, adj=0.083, (0 split)
## 
## Node number 4: 145 observations,    complexity param=0.01223242
##   predicted class=A  expected loss=0.2068966  P(node) =0.29
##     class counts:   115    16    14
##    probabilities: 0.793 0.110 0.097 
##   left son=8 (130 obs) right son=9 (15 obs)
##   Primary splits:
##       V18 < -0.235 to the right, improve=7.563042, (0 missing)
##       V15 < 1.335  to the right, improve=6.238181, (0 missing)
##       V11 < 3.96   to the left,  improve=6.221483, (0 missing)
##       V07 < 0.135  to the right, improve=4.691366, (0 missing)
##       V12 < 3.505  to the left,  improve=3.521881, (0 missing)
##   Surrogate splits:
##       V15 < -0.065 to the right, agree=0.917, adj=0.200, (0 split)
##       V05 < 4.34   to the left,  agree=0.903, adj=0.067, (0 split)
##       V11 < 4.46   to the left,  agree=0.903, adj=0.067, (0 split)
## 
## Node number 5: 178 observations,    complexity param=0.02446483
##   predicted class=B  expected loss=0.3202247  P(node) =0.356
##     class counts:    35   121    22
##    probabilities: 0.197 0.680 0.124 
##   left son=10 (126 obs) right son=11 (52 obs)
##   Primary splits:
##       V15 < 2.115  to the left,  improve=15.451320, (0 missing)
##       V07 < 2.475  to the right, improve=11.976110, (0 missing)
##       V11 < 2.745  to the left,  improve=11.261400, (0 missing)
##       V18 < 0.86   to the right, improve=10.659630, (0 missing)
##       V12 < 1.4    to the left,  improve= 6.268115, (0 missing)
##   Surrogate splits:
##       V07 < 2.21   to the right, agree=0.820, adj=0.385, (0 split)
##       V18 < 1.06   to the left,  agree=0.764, adj=0.192, (0 split)
##       V05 < 0.695  to the right, agree=0.736, adj=0.096, (0 split)
##       V12 < 4.845  to the left,  agree=0.725, adj=0.058, (0 split)
##       V10 < 2.79   to the right, agree=0.713, adj=0.019, (0 split)
## 
## Node number 6: 12 observations
##   predicted class=A  expected loss=0.1666667  P(node) =0.024
##     class counts:    10     0     2
##    probabilities: 0.833 0.000 0.167 
## 
## Node number 7: 165 observations,    complexity param=0.01223242
##   predicted class=C  expected loss=0.2  P(node) =0.33
##     class counts:    13    20   132
##    probabilities: 0.079 0.121 0.800 
##   left son=14 (30 obs) right son=15 (135 obs)
##   Primary splits:
##       V15 < 2.435  to the left,  improve=8.788552, (0 missing)
##       V07 < 2.83   to the right, improve=7.929258, (0 missing)
##       V18 < -0.045 to the left,  improve=6.572547, (0 missing)
##       V10 < 3.925  to the right, improve=6.372385, (0 missing)
##       V11 < 5.415  to the right, improve=4.838432, (0 missing)
##   Surrogate splits:
##       V10 < 4.82   to the right, agree=0.830, adj=0.067, (0 split)
##       V11 < 6.86   to the right, agree=0.830, adj=0.067, (0 split)
##       V18 < -1.355 to the left,  agree=0.824, adj=0.033, (0 split)
## 
## Node number 8: 130 observations
##   predicted class=A  expected loss=0.1538462  P(node) =0.26
##     class counts:   110     7    13
##    probabilities: 0.846 0.054 0.100 
## 
## Node number 9: 15 observations
##   predicted class=B  expected loss=0.4  P(node) =0.03
##     class counts:     5     9     1
##    probabilities: 0.333 0.600 0.067 
## 
## Node number 10: 126 observations
##   predicted class=B  expected loss=0.1666667  P(node) =0.252
##     class counts:    17   105     4
##    probabilities: 0.135 0.833 0.032 
## 
## Node number 11: 52 observations,    complexity param=0.02446483
##   predicted class=A  expected loss=0.6538462  P(node) =0.104
##     class counts:    18    16    18
##    probabilities: 0.346 0.308 0.346 
##   left son=22 (27 obs) right son=23 (25 obs)
##   Primary splits:
##       V12 < 3.37   to the left,  improve=8.813903, (0 missing)
##       V11 < 3.37   to the left,  improve=7.556914, (0 missing)
##       V18 < 1.125  to the right, improve=6.325014, (0 missing)
##       V07 < 1.46   to the right, improve=5.961790, (0 missing)
##       V05 < 1.57   to the right, improve=5.602051, (0 missing)
##   Surrogate splits:
##       V11 < 3.705  to the left,  agree=0.865, adj=0.72, (0 split)
##       V05 < 1.125  to the right, agree=0.731, adj=0.44, (0 split)
##       V07 < 1.89   to the right, agree=0.712, adj=0.40, (0 split)
##       V18 < -0.085 to the right, agree=0.712, adj=0.40, (0 split)
##       V10 < 3.855  to the left,  agree=0.692, adj=0.36, (0 split)
## 
## Node number 14: 30 observations,    complexity param=0.01223242
##   predicted class=B  expected loss=0.5  P(node) =0.06
##     class counts:     0    15    15
##    probabilities: 0.000 0.500 0.500 
##   left son=28 (10 obs) right son=29 (20 obs)
##   Primary splits:
##       V05 < -0.26  to the right, improve=4.800000, (0 missing)
##       V07 < 2.83   to the right, improve=4.565217, (0 missing)
##       V11 < 5.275  to the right, improve=3.516746, (0 missing)
##       V18 < 0.24   to the left,  improve=3.516746, (0 missing)
##       V10 < 3.77   to the right, improve=1.984127, (0 missing)
##   Surrogate splits:
##       V12 < 3.915  to the left,  agree=0.800, adj=0.4, (0 split)
##       V11 < 5.63   to the right, agree=0.733, adj=0.2, (0 split)
##       V18 < -0.175 to the left,  agree=0.733, adj=0.2, (0 split)
##       V07 < 2.73   to the right, agree=0.700, adj=0.1, (0 split)
##       V15 < 1.395  to the left,  agree=0.700, adj=0.1, (0 split)
## 
## Node number 15: 135 observations
##   predicted class=C  expected loss=0.1333333  P(node) =0.27
##     class counts:    13     5   117
##    probabilities: 0.096 0.037 0.867 
## 
## Node number 22: 27 observations
##   predicted class=A  expected loss=0.3333333  P(node) =0.054
##     class counts:    18     5     4
##    probabilities: 0.667 0.185 0.148 
## 
## Node number 23: 25 observations,    complexity param=0.01834862
##   predicted class=C  expected loss=0.44  P(node) =0.05
##     class counts:     0    11    14
##    probabilities: 0.000 0.440 0.560 
##   left son=46 (14 obs) right son=47 (11 obs)
##   Primary splits:
##       V18 < 0.76   to the left,  improve=4.787532, (0 missing)
##       V15 < 3.245  to the left,  improve=3.853333, (0 missing)
##       V12 < 3.975  to the left,  improve=3.383492, (0 missing)
##       V07 < 1.56   to the right, improve=2.371282, (0 missing)
##       V05 < 0.65   to the left,  improve=1.462857, (0 missing)
##   Surrogate splits:
##       V07 < 0.85   to the right, agree=0.84, adj=0.636, (0 split)
##       V10 < 3.76   to the right, agree=0.72, adj=0.364, (0 split)
##       V11 < 5.085  to the left,  agree=0.72, adj=0.364, (0 split)
##       V12 < 4.11   to the left,  agree=0.72, adj=0.364, (0 split)
##       V15 < 3.89   to the left,  agree=0.72, adj=0.364, (0 split)
## 
## Node number 28: 10 observations
##   predicted class=B  expected loss=0.1  P(node) =0.02
##     class counts:     0     9     1
##    probabilities: 0.000 0.900 0.100 
## 
## Node number 29: 20 observations
##   predicted class=C  expected loss=0.3  P(node) =0.04
##     class counts:     0     6    14
##    probabilities: 0.000 0.300 0.700 
## 
## Node number 46: 14 observations
##   predicted class=B  expected loss=0.2857143  P(node) =0.028
##     class counts:     0    10     4
##    probabilities: 0.000 0.714 0.286 
## 
## Node number 47: 11 observations
##   predicted class=C  expected loss=0.09090909  P(node) =0.022
##     class counts:     0     1    10
##    probabilities: 0.000 0.091 0.909 
## 
## n= 500 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 500 327 A (0.34600000 0.31400000 0.34000000)  
##    2) V05>=0.38 323 173 A (0.46439628 0.42414861 0.11145511)  
##      4) V10< 2.775 145  30 A (0.79310345 0.11034483 0.09655172)  
##        8) V18>=-0.235 130  20 A (0.84615385 0.05384615 0.10000000) *
##        9) V18< -0.235 15   6 B (0.33333333 0.60000000 0.06666667) *
##      5) V10>=2.775 178  57 B (0.19662921 0.67977528 0.12359551)  
##       10) V15< 2.115 126  21 B (0.13492063 0.83333333 0.03174603) *
##       11) V15>=2.115 52  34 A (0.34615385 0.30769231 0.34615385)  
##         22) V12< 3.37 27   9 A (0.66666667 0.18518519 0.14814815) *
##         23) V12>=3.37 25  11 C (0.00000000 0.44000000 0.56000000)  
##           46) V18< 0.76 14   4 B (0.00000000 0.71428571 0.28571429) *
##           47) V18>=0.76 11   1 C (0.00000000 0.09090909 0.90909091) *
##    3) V05< 0.38 177  43 C (0.12994350 0.11299435 0.75706215)  
##      6) V11< 1.705 12   2 A (0.83333333 0.00000000 0.16666667) *
##      7) V11>=1.705 165  33 C (0.07878788 0.12121212 0.80000000)  
##       14) V15< 2.435 30  15 B (0.00000000 0.50000000 0.50000000)  
##         28) V05>=-0.26 10   1 B (0.00000000 0.90000000 0.10000000) *
##         29) V05< -0.26 20   6 C (0.00000000 0.30000000 0.70000000) *
##       15) V15>=2.435 135  18 C (0.09629630 0.03703704 0.86666667) *
# Q.41 - prediction
predArbre <- predict(mArbre,newdata=DTest,type="class")
print(table(predArbre))
## predArbre
##    A    B    C 
## 1735 1855 1410
# Q.42 - performance - clearly worse than the logistic regression
# on these data
print(caret::confusionMatrix(data=predArbre,reference=DTest$classe))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C
##          A 1156  203  376
##          B  346 1283  226
##          C  155  161 1094
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7066          
##                  95% CI : (0.6938, 0.7192)
##     No Information Rate : 0.3392          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5602          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C
## Sensitivity            0.6976   0.7790   0.6450
## Specificity            0.8268   0.8294   0.9044
## Pos Pred Value         0.6663   0.6916   0.7759
## Neg Pred Value         0.8466   0.8843   0.8323
## Prevalence             0.3314   0.3294   0.3392
## Detection Rate         0.2312   0.2566   0.2188
## Detection Prevalence   0.3470   0.3710   0.2820
## Balanced Accuracy      0.7622   0.8042   0.7747
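# possible follow-up (sketch, illustrative names bestCp, mArbreElague): prune
# the tree at the cp value minimizing the cross-validated error (xerror in
# mArbre$cptable); xerror depends on random cross-validation folds, so the
# selected cp may vary from one run to another
bestCp <- mArbre$cptable[which.min(mArbre$cptable[, "xerror"]), "CP"]
mArbreElague <- prune(mArbre, cp = bestCp)
print(mArbreElague$cptable)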