#préambule - fonction de calcul taux d'erreur
#modele est un objet issu de glm()
#donnees est un data frame des données "diabete"
calculer_taux_erreur <- function(modele,donnees){
  #obtenir la proba d'être positif
  pplus <- predict(modele,newdata=donnees,type="response")
  #obtenir la prédiction
  prediction <- as.factor(ifelse(pplus > 0.5,"positive","negative"))
  #calculer la matrice de confusion
  mc <- table(donnees.test$diabete,prediction)
  cat("Matrice de confusion :\n")
  print(mc)
  #calculer le taux d'erreur
  err.rate <- (mc[2,1]+mc[1,2])/sum(mc)
  cat("\nTaux d'erreur : ")
  cat(err.rate)
}

Importation et inspection des données

# Q.1 - charger le package "xlsx"
library(xlsx) #lecture directe des fichiers EXCEL
# Q.2 - charger les données apprentissage, attention aux options, notamment stringsAsFactors
donnees.app <- xlsx::read.xlsx(file="diabete_reg_logistique.xlsx",sheetIndex="apprentissage",stringsAsFactors=TRUE)
# Q.3 -- caractéristiques
str(donnees.app)
## 'data.frame':    568 obs. of  11 variables:
##  $ pregnant : num  0 4 3 3 5 2 2 9 1 2 ...
##  $ plasma   : num  138 142 142 113 88 110 129 57 79 99 ...
##  $ diastolic: num  0 86 80 50 78 74 84 80 60 70 ...
##  $ triceps  : num  0 0 15 10 30 29 0 37 42 16 ...
##  $ serum    : num  0 0 0 85 0 125 0 0 48 44 ...
##  $ bodymass : num  36.3 44 32.4 29.5 27.6 32.4 28 32.8 43.5 20.4 ...
##  $ pedigree : num  0.933 0.645 0.2 0.626 0.258 0.698 0.284 0.096 0.678 0.235 ...
##  $ age      : num  25 22 63 25 37 27 27 41 23 27 ...
##  $ alea1    : num  0.338 0.835 0.493 0.857 0.045 0.382 0.992 0.618 0.736 0.478 ...
##  $ alea2    : num  0.188 0.711 0.845 0.821 0.392 0.721 0.224 0.478 0.371 0.79 ...
##  $ diabete  : Factor w/ 2 levels "negative","positive": 2 2 1 1 1 1 1 1 1 1 ...
# Q.4 -- summary
summary(donnees.app)
##     pregnant          plasma        diastolic         triceps    
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.0  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.00   1st Qu.: 0.0  
##  Median : 3.000   Median :115.0   Median : 72.00   Median :23.0  
##  Mean   : 3.778   Mean   :120.3   Mean   : 69.27   Mean   :20.9  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:33.0  
##  Max.   :15.000   Max.   :198.0   Max.   :122.00   Max.   :99.0  
##      serum           bodymass        pedigree           age       
##  Min.   :  0.00   Min.   : 0.00   Min.   :0.0780   Min.   :21.00  
##  1st Qu.:  0.00   1st Qu.:27.18   1st Qu.:0.2380   1st Qu.:24.00  
##  Median : 39.00   Median :31.75   Median :0.3690   Median :29.00  
##  Mean   : 81.96   Mean   :31.88   Mean   :0.4612   Mean   :33.23  
##  3rd Qu.:125.25   3rd Qu.:36.50   3rd Qu.:0.6085   3rd Qu.:40.00  
##  Max.   :846.00   Max.   :55.00   Max.   :2.3290   Max.   :81.00  
##      alea1            alea2           diabete   
##  Min.   :0.0080   Min.   :0.002   negative:372  
##  1st Qu.:0.2427   1st Qu.:0.281   positive:196  
##  Median :0.4845   Median :0.513                 
##  Mean   :0.4952   Mean   :0.507                 
##  3rd Qu.:0.7388   3rd Qu.:0.743                 
##  Max.   :0.9980   Max.   :0.999
#données test (Q.2 + 3 + 4)
donnees.test <- read.xlsx(file="diabete_reg_logistique.xlsx",sheetIndex="test",stringsAsFactors=TRUE)

#caractéristiques
str(donnees.test)
## 'data.frame':    200 obs. of  11 variables:
##  $ pregnant : num  4 8 0 3 0 11 2 1 4 1 ...
##  $ plasma   : num  132 74 93 108 137 136 93 181 83 114 ...
##  $ diastolic: num  0 70 60 62 68 84 64 78 86 66 ...
##  $ triceps  : num  0 40 25 24 14 35 32 42 19 36 ...
##  $ serum    : num  0 49 92 0 148 130 160 293 0 200 ...
##  $ bodymass : num  32.9 35.3 28.7 26 24.8 28.3 38 40 29.3 38.1 ...
##  $ pedigree : num  0.302 0.705 0.532 0.223 0.143 ...
##  $ age      : num  23 39 22 25 21 42 23 22 34 21 ...
##  $ alea1    : num  0.714 0.749 0.868 0.144 0.544 0.06 0.897 0.606 0.031 0.14 ...
##  $ alea2    : num  0.695 0.881 0.178 0.756 0.166 0.301 0.987 0.933 0.538 0.456 ...
##  $ diabete  : Factor w/ 2 levels "negative","positive": 2 1 1 1 1 2 2 2 1 1 ...
#summary
print(summary(donnees.test))
##     pregnant          plasma        diastolic         triceps     
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.:102.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :120.0   Median : 72.00   Median :22.00  
##  Mean   : 4.035   Mean   :122.6   Mean   : 68.62   Mean   :19.49  
##  3rd Qu.: 6.000   3rd Qu.:139.5   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :110.00   Max.   :63.00  
##      serum           bodymass        pedigree           age       
##  Min.   :  0.00   Min.   : 0.00   Min.   :0.1000   Min.   :21.00  
##  1st Qu.:  0.00   1st Qu.:27.77   1st Qu.:0.2510   1st Qu.:24.00  
##  Median :  0.00   Median :32.80   Median :0.3820   Median :29.00  
##  Mean   : 73.67   Mean   :32.32   Mean   :0.5023   Mean   :33.27  
##  3rd Qu.:131.25   3rd Qu.:36.90   3rd Qu.:0.6673   3rd Qu.:41.25  
##  Max.   :543.00   Max.   :67.10   Max.   :2.4200   Max.   :72.00  
##      alea1            alea2            diabete   
##  Min.   :0.0000   Min.   :0.0060   negative:128  
##  1st Qu.:0.2020   1st Qu.:0.2218   positive: 72  
##  Median :0.5040   Median :0.4490                 
##  Mean   :0.5038   Mean   :0.4698                 
##  3rd Qu.:0.7792   3rd Qu.:0.6955                 
##  Max.   :1.0000   Max.   :0.9870

Modélisation et évaluation [statistique] du modèle (1)

# Q.5 + 6 - modélisation avec la régression logistique - diabete vs. les autres variables
# cf. option "family"
modele <- glm(diabete ~ ., data = donnees.app, family = binomial)
print(modele)
## 
## Call:  glm(formula = diabete ~ ., family = binomial, data = donnees.app)
## 
## Coefficients:
## (Intercept)     pregnant       plasma    diastolic      triceps        serum  
##  -8.1901884    0.0947479    0.0321814   -0.0164547    0.0029843   -0.0009302  
##    bodymass     pedigree          age        alea1        alea2  
##   0.0946971    0.7916786    0.0227797   -0.2241445    0.3815692  
## 
## Degrees of Freedom: 567 Total (i.e. Null);  557 Residual
## Null Deviance:       732 
## Residual Deviance: 540.9     AIC: 562.9
# Q.7 - attributs de l'objet modele
print(attributes(modele))
## $names
##  [1] "coefficients"      "residuals"         "fitted.values"    
##  [4] "effects"           "R"                 "rank"             
##  [7] "qr"                "family"            "linear.predictors"
## [10] "deviance"          "aic"               "null.deviance"    
## [13] "iter"              "weights"           "prior.weights"    
## [16] "df.residual"       "df.null"           "y"                
## [19] "converged"         "boundary"          "model"            
## [22] "call"              "formula"           "terms"            
## [25] "data"              "offset"            "control"          
## [28] "method"            "contrasts"         "xlevels"          
## 
## $class
## [1] "glm" "lm"
# Q.8 -- R2 McFadden
LLa <- modele$deviance/(-2)
LL0 <- modele$null.deviance/(-2)
R2MF <- 1.0 - LLa/LL0
print(R2MF)
## [1] 0.261014
#R2 Cox-Snell
La <- exp(LLa)
L0 <- exp(LL0)
R2CS <- 1.0 - (L0/La)^(2/nrow(donnees.app))
print(R2CS)
## [1] 0.2856372
#R2 Nagelkerke
R2N <- R2CS/(1.0 - L0^(2/nrow(donnees.app)))
print(R2N)
## [1] 0.3943269
# Q.9 - test de significativé globale
#chi-2 du rapport de vraisemblance
chi2 <- modele$null.deviance - modele$deviance
print(chi2)
## [1] 191.0549
#ddl - degré de liberté
ddl <- modele$df.null - modele$df.residual
print(ddl)
## [1] 10
#p-value
pvalue <- pchisq(chi2,ddl,lower.tail = FALSE)
print(pvalue)
## [1] 1.17931e-35

Evaluation [prédictive] du modèle (2)

# Q.10 - prédiction
# "response" qui revient à calculer le score d'affectation à la 2nd modalité (positive)
# si link, on aurait eu le logit c.-à-d. la combinaison linéaire + la constante
proba <- predict(modele, newdata = donnees.test, type = "response")

#affichage pour les premiers individus
head(proba)
##          1          2          3          4          5          6 
## 0.60359199 0.23663992 0.06403929 0.12846841 0.10767907 0.44372734
# Q.11 - prédiction
# conversion des probabilités d'affectation en classes prédites
pred <- factor(ifelse(proba > 0.5, "positive", "negative"))

#distribution des classes prédites
print(table(pred))
## pred
## negative positive 
##      139       61
# Q.12 - matrice de confusion : classes observées vs. classes prédites
mc <- table(donnees.test$diabete,pred)
print(mc)
##           pred
##            negative positive
##   negative      115       13
##   positive       24       48
#taux d'erreur
err <- 1.0-sum(diag(mc))/sum(mc)
print(err)
## [1] 0.185
#appel de la fonction pour le calcul du taux d'erreur
calculer_taux_erreur(modele,donnees.test)
## Matrice de confusion :
##           prediction
##            negative positive
##   negative      115       13
##   positive       24       48
## 
## Taux d'erreur : 0.185

Evaluation des variables

# Q.13 + 14 - test de Wald pour l'évaluation des variables
res <- summary(modele)
print(res)
## 
## Call:
## glm(formula = diabete ~ ., family = binomial, data = donnees.app)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4778  -0.7382  -0.4222   0.7302   2.7998  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.1901884  0.8869146  -9.234  < 2e-16 ***
## pregnant     0.0947479  0.0384014   2.467   0.0136 *  
## plasma       0.0321814  0.0041123   7.826 5.05e-15 ***
## diastolic   -0.0164547  0.0063177  -2.605   0.0092 ** 
## triceps      0.0029843  0.0078995   0.378   0.7056    
## serum       -0.0009302  0.0009879  -0.942   0.3464    
## bodymass     0.0946971  0.0177019   5.350 8.82e-08 ***
## pedigree     0.7916786  0.3560964   2.223   0.0262 *  
## age          0.0227797  0.0112745   2.020   0.0433 *  
## alea1       -0.2241445  0.3727308  -0.601   0.5476    
## alea2        0.3815692  0.3781735   1.009   0.3130    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 731.97  on 567  degrees of freedom
## Residual deviance: 540.92  on 557  degrees of freedom
## AIC: 562.92
## 
## Number of Fisher Scoring iterations: 5

Sélection de variables “backward” - Critère AIC

# Q.15 - librairie MASS pour stepAIC
library(MASS)

#processus de sélection backward
modele.backward_AIC <- stepAIC(modele, data = donnees.app, direction="backward")
## Start:  AIC=562.92
## diabete ~ pregnant + plasma + diastolic + triceps + serum + bodymass + 
##     pedigree + age + alea1 + alea2
## 
##             Df Deviance    AIC
## - triceps    1   541.06 561.06
## - alea1      1   541.28 561.28
## - serum      1   541.80 561.80
## - alea2      1   541.94 561.94
## <none>           540.92 562.92
## - age        1   545.01 565.01
## - pedigree   1   546.01 566.01
## - pregnant   1   547.13 567.13
## - diastolic  1   547.94 567.94
## - bodymass   1   573.95 593.95
## - plasma     1   617.52 637.52
## 
## Step:  AIC=561.06
## diabete ~ pregnant + plasma + diastolic + serum + bodymass + 
##     pedigree + age + alea1 + alea2
## 
##             Df Deviance    AIC
## - alea1      1   541.45 559.45
## - serum      1   541.80 559.80
## - alea2      1   542.10 560.10
## <none>           541.06 561.06
## - age        1   545.03 563.03
## - pedigree   1   546.37 564.37
## - pregnant   1   547.40 565.40
## - diastolic  1   547.95 565.95
## - bodymass   1   578.96 596.96
## - plasma     1   618.07 636.07
## 
## Step:  AIC=559.45
## diabete ~ pregnant + plasma + diastolic + serum + bodymass + 
##     pedigree + age + alea2
## 
##             Df Deviance    AIC
## - serum      1   542.20 558.20
## - alea2      1   542.56 558.56
## <none>           541.45 559.45
## - age        1   545.46 561.46
## - pedigree   1   546.91 562.91
## - pregnant   1   547.68 563.68
## - diastolic  1   548.49 564.49
## - bodymass   1   579.80 595.80
## - plasma     1   618.56 634.56
## 
## Step:  AIC=558.2
## diabete ~ pregnant + plasma + diastolic + bodymass + pedigree + 
##     age + alea2
## 
##             Df Deviance    AIC
## - alea2      1   543.50 557.50
## <none>           542.20 558.20
## - age        1   546.54 560.54
## - pedigree   1   547.24 561.24
## - pregnant   1   548.61 562.61
## - diastolic  1   549.34 563.34
## - bodymass   1   579.80 593.80
## - plasma     1   622.73 636.73
## 
## Step:  AIC=557.5
## diabete ~ pregnant + plasma + diastolic + bodymass + pedigree + 
##     age
## 
##             Df Deviance    AIC
## <none>           543.50 557.50
## - age        1   547.93 559.93
## - pedigree   1   548.53 560.53
## - pregnant   1   549.99 561.99
## - diastolic  1   550.82 562.82
## - bodymass   1   581.16 593.16
## - plasma     1   624.35 636.35
#caractéristiques du modèle final
summary(modele.backward_AIC)
## 
## Call:
## glm(formula = diabete ~ pregnant + plasma + diastolic + bodymass + 
##     pedigree + age, family = binomial, data = donnees.app)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5974  -0.7343  -0.4326   0.7434   2.7641  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.980598   0.819010  -9.744  < 2e-16 ***
## pregnant     0.095927   0.038057   2.521  0.01172 *  
## plasma       0.030872   0.003829   8.062 7.49e-16 ***
## diastolic   -0.016339   0.006154  -2.655  0.00794 ** 
## bodymass     0.094578   0.016654   5.679 1.35e-08 ***
## pedigree     0.773523   0.349523   2.213  0.02689 *  
## age          0.023376   0.011110   2.104  0.03538 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 731.97  on 567  degrees of freedom
## Residual deviance: 543.50  on 561  degrees of freedom
## AIC: 557.5
## 
## Number of Fisher Scoring iterations: 5
# Q.16 - performances prédictives du modèle définitif
calculer_taux_erreur(modele.backward_AIC,donnees.test)
## Matrice de confusion :
##           prediction
##            negative positive
##   negative      112       16
##   positive       25       47
## 
## Taux d'erreur : 0.205

Sélection de variables “backward” - Critère BIC

#selection backward - BIC - cf. le paramètre "k"
modele.backward_BIC <- stepAIC(modele, data = donnees.app, direction="backward", k = log(nrow(donnees.app)))
## Start:  AIC=610.68
## diabete ~ pregnant + plasma + diastolic + triceps + serum + bodymass + 
##     pedigree + age + alea1 + alea2
## 
##             Df Deviance    AIC
## - triceps    1   541.06 604.48
## - alea1      1   541.28 604.70
## - serum      1   541.80 605.22
## - alea2      1   541.94 605.36
## - age        1   545.01 608.44
## - pedigree   1   546.01 609.43
## - pregnant   1   547.13 610.55
## <none>           540.92 610.68
## - diastolic  1   547.94 611.36
## - bodymass   1   573.95 637.38
## - plasma     1   617.52 680.94
## 
## Step:  AIC=604.48
## diabete ~ pregnant + plasma + diastolic + serum + bodymass + 
##     pedigree + age + alea1 + alea2
## 
##             Df Deviance    AIC
## - alea1      1   541.45 598.53
## - serum      1   541.80 598.88
## - alea2      1   542.10 599.17
## - age        1   545.03 602.11
## - pedigree   1   546.37 603.45
## - pregnant   1   547.40 604.48
## <none>           541.06 604.48
## - diastolic  1   547.95 605.03
## - bodymass   1   578.96 636.04
## - plasma     1   618.07 675.15
## 
## Step:  AIC=598.53
## diabete ~ pregnant + plasma + diastolic + serum + bodymass + 
##     pedigree + age + alea2
## 
##             Df Deviance    AIC
## - serum      1   542.20 592.94
## - alea2      1   542.56 593.30
## - age        1   545.46 596.20
## - pedigree   1   546.91 597.65
## - pregnant   1   547.68 598.42
## <none>           541.45 598.53
## - diastolic  1   548.49 599.22
## - bodymass   1   579.80 630.54
## - plasma     1   618.56 669.30
## 
## Step:  AIC=592.94
## diabete ~ pregnant + plasma + diastolic + bodymass + pedigree + 
##     age + alea2
## 
##             Df Deviance    AIC
## - alea2      1   543.50 587.90
## - age        1   546.54 590.94
## - pedigree   1   547.24 591.63
## <none>           542.20 592.94
## - pregnant   1   548.61 593.00
## - diastolic  1   549.34 593.73
## - bodymass   1   579.80 624.20
## - plasma     1   622.73 667.12
## 
## Step:  AIC=587.9
## diabete ~ pregnant + plasma + diastolic + bodymass + pedigree + 
##     age
## 
##             Df Deviance    AIC
## - age        1   547.93 585.98
## - pedigree   1   548.53 586.58
## <none>           543.50 587.90
## - pregnant   1   549.99 588.05
## - diastolic  1   550.82 588.87
## - bodymass   1   581.16 619.21
## - plasma     1   624.35 662.41
## 
## Step:  AIC=585.98
## diabete ~ pregnant + plasma + diastolic + bodymass + pedigree
## 
##             Df Deviance    AIC
## - pedigree   1   553.22 584.93
## - diastolic  1   553.51 585.22
## <none>           547.93 585.98
## - pregnant   1   567.89 599.60
## - bodymass   1   582.52 614.24
## - plasma     1   640.54 672.26
## 
## Step:  AIC=584.93
## diabete ~ pregnant + plasma + diastolic + bodymass
## 
##             Df Deviance    AIC
## - diastolic  1   558.42 583.79
## <none>           553.22 584.93
## - pregnant   1   571.74 597.11
## - bodymass   1   588.67 614.04
## - plasma     1   649.73 675.10
## 
## Step:  AIC=583.79
## diabete ~ pregnant + plasma + bodymass
## 
##            Df Deviance    AIC
## <none>          558.42 583.79
## - pregnant  1   574.53 593.56
## - bodymass  1   590.69 609.72
## - plasma    1   652.06 671.09
#carcatéristiques du modèle définitif
summary(modele.backward_BIC)
## 
## Call:
## glm(formula = diabete ~ pregnant + plasma + bodymass, family = binomial, 
##     data = donnees.app)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1344  -0.7399  -0.4508   0.7968   2.7796  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.879901   0.727777 -10.827  < 2e-16 ***
## pregnant     0.122258   0.030885   3.958 7.54e-05 ***
## plasma       0.032016   0.003716   8.617  < 2e-16 ***
## bodymass     0.084730   0.015931   5.319 1.05e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 731.97  on 567  degrees of freedom
## Residual deviance: 558.42  on 564  degrees of freedom
## AIC: 566.42
## 
## Number of Fisher Scoring iterations: 5
# Q.18 - performances en test
calculer_taux_erreur(modele.backward_BIC,donnees.test)
## Matrice de confusion :
##           prediction
##            negative positive
##   negative      115       13
##   positive       28       44
## 
## Taux d'erreur : 0.205
# Q.19 - propriétés de l'objet issu de la sélection
print(attributes(modele.backward_BIC))
## $names
##  [1] "coefficients"      "residuals"         "fitted.values"    
##  [4] "effects"           "R"                 "rank"             
##  [7] "qr"                "family"            "linear.predictors"
## [10] "deviance"          "aic"               "null.deviance"    
## [13] "iter"              "weights"           "prior.weights"    
## [16] "df.residual"       "df.null"           "y"                
## [19] "converged"         "boundary"          "model"            
## [22] "call"              "formula"           "terms"            
## [25] "data"              "offset"            "control"          
## [28] "method"            "contrasts"         "xlevels"          
## [31] "anova"            
## 
## $class
## [1] "glm" "lm"
# Q.20 + 21 - le champ ANOVA
print(modele.backward_BIC$anova)
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## diabete ~ pregnant + plasma + diastolic + triceps + serum + bodymass + 
##     pedigree + age + alea1 + alea2
## 
## Final Model:
## diabete ~ pregnant + plasma + bodymass
## 
## 
##          Step Df  Deviance Resid. Df Resid. Dev      AIC
## 1                                557   540.9170 610.6804
## 2   - triceps  1 0.1430769       558   541.0601 604.4813
## 3     - alea1  1 0.3915816       559   541.4517 598.5308
## 4     - serum  1 0.7517023       560   542.2034 592.9404
## 5     - alea2  1 1.2977440       561   543.5011 587.8960
## 6       - age  1 4.4277935       562   547.9289 585.9817
## 7  - pedigree  1 5.2896430       563   553.2186 584.9292
## 8 - diastolic  1 5.2006549       564   558.4192 583.7877
# Q.22 - graphique, évolution du critère BIC
# en fonction du nombre de variables retirées
plot(0:(nrow(modele.backward_BIC$anova)-1),modele.backward_BIC$anova[,"AIC"],type="b",xlab="# de var. retirées",ylab="BIC",main="Sélection backward (BIC)")

Sélection “forward” - Critère BIC

# Q.23 - sélection forward, critère BIC

#modèle initial : modèle réduit à la constante
modele.initial <- glm(diabete ~ 1, data = donnees.app, family = binomial)
print(modele.initial)
## 
## Call:  glm(formula = diabete ~ 1, family = binomial, data = donnees.app)
## 
## Coefficients:
## (Intercept)  
##     -0.6408  
## 
## Degrees of Freedom: 567 Total (i.e. Null);  567 Residual
## Null Deviance:       732 
## Residual Deviance: 732   AIC: 734
# sélection forward - k = critère BIC
# cf. le rôle du paramètre "scope"
modele.forward <- stepAIC(modele.initial, data = donnees.app, scope = list(lower="~1",upper=as.formula(modele)),direction="forward",k=log(nrow(donnees.app)))
## Start:  AIC=738.31
## diabete ~ 1
## 
##             Df Deviance    AIC
## + plasma     1   604.67 617.35
## + bodymass   1   677.00 689.69
## + age        1   701.29 713.98
## + pregnant   1   707.58 720.26
## + serum      1   721.15 733.84
## + pedigree   1   722.67 735.35
## <none>           731.97 738.31
## + triceps    1   727.35 740.03
## + alea2      1   729.48 742.16
## + alea1      1   730.57 743.26
## + diastolic  1   731.47 744.15
## 
## Step:  AIC=617.35
## diabete ~ plasma
## 
##             Df Deviance    AIC
## + bodymass   1   574.53 593.56
## + pregnant   1   590.69 609.72
## + age        1   595.92 614.94
## <none>           604.67 617.35
## + pedigree   1   600.12 619.14
## + triceps    1   602.17 621.20
## + alea2      1   602.88 621.91
## + alea1      1   603.63 622.66
## + diastolic  1   603.68 622.71
## + serum      1   604.52 623.55
## 
## Step:  AIC=593.56
## diabete ~ plasma + bodymass
## 
##             Df Deviance    AIC
## + pregnant   1   558.42 583.79
## + age        1   560.88 586.25
## <none>           574.53 593.56
## + pedigree   1   570.87 596.24
## + diastolic  1   571.74 597.11
## + alea2      1   572.49 597.86
## + serum      1   572.94 598.31
## + alea1      1   573.92 599.29
## + triceps    1   574.30 599.67
## 
## Step:  AIC=583.79
## diabete ~ plasma + bodymass + pregnant
## 
##             Df Deviance    AIC
## <none>           558.42 583.79
## + diastolic  1   553.22 584.93
## + pedigree   1   553.51 585.22
## + age        1   555.47 587.18
## + alea2      1   556.93 588.64
## + alea1      1   557.58 589.29
## + serum      1   557.63 589.35
## + triceps    1   558.33 590.04
#modèle définitif
summary(modele.forward)
## 
## Call:
## glm(formula = diabete ~ plasma + bodymass + pregnant, family = binomial, 
##     data = donnees.app)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1344  -0.7399  -0.4508   0.7968   2.7796  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.879901   0.727777 -10.827  < 2e-16 ***
## plasma       0.032016   0.003716   8.617  < 2e-16 ***
## bodymass     0.084730   0.015931   5.319 1.05e-07 ***
## pregnant     0.122258   0.030885   3.958 7.54e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 731.97  on 567  degrees of freedom
## Residual deviance: 558.42  on 564  degrees of freedom
## AIC: 566.42
## 
## Number of Fisher Scoring iterations: 5
# Q.24 - ordre de sélection des variables
# accès de nouveau au champ ANOVA
print(modele.forward$anova)
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## diabete ~ 1
## 
## Final Model:
## diabete ~ plasma + bodymass + pregnant
## 
## 
##         Step Df  Deviance Resid. Df Resid. Dev      AIC
## 1                               567   731.9720 738.3141
## 2   + plasma  1 127.30197       566   604.6700 617.3542
## 3 + bodymass  1  30.13784       565   574.5321 593.5585
## 4 + pregnant  1  16.11291       564   558.4192 583.7877
# graphique : évolution du critère
# en fonction du nombre de variables introduites
plot(0:(nrow(modele.forward$anova)-1),modele.forward$anova[,"AIC"],type="b",xlab="# de var. introdutes",ylab="BIC",main="Sélection forward (BIC)")