#préambule - fonction de calcul taux d'erreur
#modele est un objet issu de glm()
#donnees est un data frame des données "diabete"
calculer_taux_erreur <- function(modele,donnees){
#obtenir la proba d'être positif
pplus <- predict(modele,newdata=donnees,type="response")
#obtenir la prédiction
prediction <- as.factor(ifelse(pplus > 0.5,"positive","negative"))
#calculer la matrice de confusion
mc <- table(donnees.test$diabete,prediction)
cat("Matrice de confusion :\n")
print(mc)
#calculer le taux d'erreur
err.rate <- (mc[2,1]+mc[1,2])/sum(mc)
cat("\nTaux d'erreur : ")
cat(err.rate)
}
Importation et inspection des données
# Q.1 - charger le package "xlsx"
library(xlsx) #lecture directe des fichiers EXCEL
# Q.2 - charger les données apprentissage, attention aux options, notamment stringsAsFactors
donnees.app <- xlsx::read.xlsx(file="diabete_reg_logistique.xlsx",sheetIndex="apprentissage",stringsAsFactors=TRUE)
# Q.3 -- caractéristiques
str(donnees.app)
## 'data.frame': 568 obs. of 11 variables:
## $ pregnant : num 0 4 3 3 5 2 2 9 1 2 ...
## $ plasma : num 138 142 142 113 88 110 129 57 79 99 ...
## $ diastolic: num 0 86 80 50 78 74 84 80 60 70 ...
## $ triceps : num 0 0 15 10 30 29 0 37 42 16 ...
## $ serum : num 0 0 0 85 0 125 0 0 48 44 ...
## $ bodymass : num 36.3 44 32.4 29.5 27.6 32.4 28 32.8 43.5 20.4 ...
## $ pedigree : num 0.933 0.645 0.2 0.626 0.258 0.698 0.284 0.096 0.678 0.235 ...
## $ age : num 25 22 63 25 37 27 27 41 23 27 ...
## $ alea1 : num 0.338 0.835 0.493 0.857 0.045 0.382 0.992 0.618 0.736 0.478 ...
## $ alea2 : num 0.188 0.711 0.845 0.821 0.392 0.721 0.224 0.478 0.371 0.79 ...
## $ diabete : Factor w/ 2 levels "negative","positive": 2 2 1 1 1 1 1 1 1 1 ...
# Q.4 -- summary
summary(donnees.app)
## pregnant plasma diastolic triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.0
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.: 0.0
## Median : 3.000 Median :115.0 Median : 72.00 Median :23.0
## Mean : 3.778 Mean :120.3 Mean : 69.27 Mean :20.9
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:33.0
## Max. :15.000 Max. :198.0 Max. :122.00 Max. :99.0
## serum bodymass pedigree age
## Min. : 0.00 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.00 1st Qu.:27.18 1st Qu.:0.2380 1st Qu.:24.00
## Median : 39.00 Median :31.75 Median :0.3690 Median :29.00
## Mean : 81.96 Mean :31.88 Mean :0.4612 Mean :33.23
## 3rd Qu.:125.25 3rd Qu.:36.50 3rd Qu.:0.6085 3rd Qu.:40.00
## Max. :846.00 Max. :55.00 Max. :2.3290 Max. :81.00
## alea1 alea2 diabete
## Min. :0.0080 Min. :0.002 negative:372
## 1st Qu.:0.2427 1st Qu.:0.281 positive:196
## Median :0.4845 Median :0.513
## Mean :0.4952 Mean :0.507
## 3rd Qu.:0.7388 3rd Qu.:0.743
## Max. :0.9980 Max. :0.999
#données test (Q.2 + 3 + 4)
donnees.test <- read.xlsx(file="diabete_reg_logistique.xlsx",sheetIndex="test",stringsAsFactors=TRUE)
#caractéristiques
str(donnees.test)
## 'data.frame': 200 obs. of 11 variables:
## $ pregnant : num 4 8 0 3 0 11 2 1 4 1 ...
## $ plasma : num 132 74 93 108 137 136 93 181 83 114 ...
## $ diastolic: num 0 70 60 62 68 84 64 78 86 66 ...
## $ triceps : num 0 40 25 24 14 35 32 42 19 36 ...
## $ serum : num 0 49 92 0 148 130 160 293 0 200 ...
## $ bodymass : num 32.9 35.3 28.7 26 24.8 28.3 38 40 29.3 38.1 ...
## $ pedigree : num 0.302 0.705 0.532 0.223 0.143 ...
## $ age : num 23 39 22 25 21 42 23 22 34 21 ...
## $ alea1 : num 0.714 0.749 0.868 0.144 0.544 0.06 0.897 0.606 0.031 0.14 ...
## $ alea2 : num 0.695 0.881 0.178 0.756 0.166 0.301 0.987 0.933 0.538 0.456 ...
## $ diabete : Factor w/ 2 levels "negative","positive": 2 1 1 1 1 2 2 2 1 1 ...
#summary
print(summary(donnees.test))
## pregnant plasma diastolic triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.:102.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :120.0 Median : 72.00 Median :22.00
## Mean : 4.035 Mean :122.6 Mean : 68.62 Mean :19.49
## 3rd Qu.: 6.000 3rd Qu.:139.5 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :110.00 Max. :63.00
## serum bodymass pedigree age
## Min. : 0.00 Min. : 0.00 Min. :0.1000 Min. :21.00
## 1st Qu.: 0.00 1st Qu.:27.77 1st Qu.:0.2510 1st Qu.:24.00
## Median : 0.00 Median :32.80 Median :0.3820 Median :29.00
## Mean : 73.67 Mean :32.32 Mean :0.5023 Mean :33.27
## 3rd Qu.:131.25 3rd Qu.:36.90 3rd Qu.:0.6673 3rd Qu.:41.25
## Max. :543.00 Max. :67.10 Max. :2.4200 Max. :72.00
## alea1 alea2 diabete
## Min. :0.0000 Min. :0.0060 negative:128
## 1st Qu.:0.2020 1st Qu.:0.2218 positive: 72
## Median :0.5040 Median :0.4490
## Mean :0.5038 Mean :0.4698
## 3rd Qu.:0.7792 3rd Qu.:0.6955
## Max. :1.0000 Max. :0.9870
Modélisation et évaluation [statistique] du modèle (1)
# Q.5 + 6 - modélisation avec la régression logistique - diabete vs. les autres variables
# cf. option "family"
modele <- glm(diabete ~ ., data = donnees.app, family = binomial)
print(modele)
##
## Call: glm(formula = diabete ~ ., family = binomial, data = donnees.app)
##
## Coefficients:
## (Intercept) pregnant plasma diastolic triceps serum
## -8.1901884 0.0947479 0.0321814 -0.0164547 0.0029843 -0.0009302
## bodymass pedigree age alea1 alea2
## 0.0946971 0.7916786 0.0227797 -0.2241445 0.3815692
##
## Degrees of Freedom: 567 Total (i.e. Null); 557 Residual
## Null Deviance: 732
## Residual Deviance: 540.9 AIC: 562.9
# Q.7 - attributs de l'objet modele
print(attributes(modele))
## $names
## [1] "coefficients" "residuals" "fitted.values"
## [4] "effects" "R" "rank"
## [7] "qr" "family" "linear.predictors"
## [10] "deviance" "aic" "null.deviance"
## [13] "iter" "weights" "prior.weights"
## [16] "df.residual" "df.null" "y"
## [19] "converged" "boundary" "model"
## [22] "call" "formula" "terms"
## [25] "data" "offset" "control"
## [28] "method" "contrasts" "xlevels"
##
## $class
## [1] "glm" "lm"
# Q.8 -- R2 McFadden
LLa <- modele$deviance/(-2)
LL0 <- modele$null.deviance/(-2)
R2MF <- 1.0 - LLa/LL0
print(R2MF)
## [1] 0.261014
#R2 Cox-Snell
La <- exp(LLa)
L0 <- exp(LL0)
R2CS <- 1.0 - (L0/La)^(2/nrow(donnees.app))
print(R2CS)
## [1] 0.2856372
#R2 Nagelkerke
R2N <- R2CS/(1.0 - L0^(2/nrow(donnees.app)))
print(R2N)
## [1] 0.3943269
# Q.9 - test de significativé globale
#chi-2 du rapport de vraisemblance
chi2 <- modele$null.deviance - modele$deviance
print(chi2)
## [1] 191.0549
#ddl - degré de liberté
ddl <- modele$df.null - modele$df.residual
print(ddl)
## [1] 10
#p-value
pvalue <- pchisq(chi2,ddl,lower.tail = FALSE)
print(pvalue)
## [1] 1.17931e-35
Evaluation [prédictive] du modèle (2)
# Q.10 - prédiction
# "response" qui revient à calculer le score d'affectation à la 2nd modalité (positive)
# si link, on aurait eu le logit c.-à-d. la combinaison linéaire + la constante
proba <- predict(modele, newdata = donnees.test, type = "response")
#affichage pour les premiers individus
head(proba)
## 1 2 3 4 5 6
## 0.60359199 0.23663992 0.06403929 0.12846841 0.10767907 0.44372734
# Q.11 - prédiction
# conversion des probabilités d'affectation en classes prédites
pred <- factor(ifelse(proba > 0.5, "positive", "negative"))
#distribution des classes prédites
print(table(pred))
## pred
## negative positive
## 139 61
# Q.12 - matrice de confusion : classes observées vs. classes prédites
mc <- table(donnees.test$diabete,pred)
print(mc)
## pred
## negative positive
## negative 115 13
## positive 24 48
#taux d'erreur
err <- 1.0-sum(diag(mc))/sum(mc)
print(err)
## [1] 0.185
#appel de la fonction pour le calcul du taux d'erreur
calculer_taux_erreur(modele,donnees.test)
## Matrice de confusion :
## prediction
## negative positive
## negative 115 13
## positive 24 48
##
## Taux d'erreur : 0.185
Evaluation des variables
# Q.13 + 14 - test de Wald pour l'évaluation des variables
res <- summary(modele)
print(res)
##
## Call:
## glm(formula = diabete ~ ., family = binomial, data = donnees.app)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4778 -0.7382 -0.4222 0.7302 2.7998
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.1901884 0.8869146 -9.234 < 2e-16 ***
## pregnant 0.0947479 0.0384014 2.467 0.0136 *
## plasma 0.0321814 0.0041123 7.826 5.05e-15 ***
## diastolic -0.0164547 0.0063177 -2.605 0.0092 **
## triceps 0.0029843 0.0078995 0.378 0.7056
## serum -0.0009302 0.0009879 -0.942 0.3464
## bodymass 0.0946971 0.0177019 5.350 8.82e-08 ***
## pedigree 0.7916786 0.3560964 2.223 0.0262 *
## age 0.0227797 0.0112745 2.020 0.0433 *
## alea1 -0.2241445 0.3727308 -0.601 0.5476
## alea2 0.3815692 0.3781735 1.009 0.3130
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 731.97 on 567 degrees of freedom
## Residual deviance: 540.92 on 557 degrees of freedom
## AIC: 562.92
##
## Number of Fisher Scoring iterations: 5
Sélection de variables “backward” - Critère AIC
# Q.15 - librairie MASS pour stepAIC
library(MASS)
#processus de sélection backward
modele.backward_AIC <- stepAIC(modele, data = donnees.app, direction="backward")
## Start: AIC=562.92
## diabete ~ pregnant + plasma + diastolic + triceps + serum + bodymass +
## pedigree + age + alea1 + alea2
##
## Df Deviance AIC
## - triceps 1 541.06 561.06
## - alea1 1 541.28 561.28
## - serum 1 541.80 561.80
## - alea2 1 541.94 561.94
## <none> 540.92 562.92
## - age 1 545.01 565.01
## - pedigree 1 546.01 566.01
## - pregnant 1 547.13 567.13
## - diastolic 1 547.94 567.94
## - bodymass 1 573.95 593.95
## - plasma 1 617.52 637.52
##
## Step: AIC=561.06
## diabete ~ pregnant + plasma + diastolic + serum + bodymass +
## pedigree + age + alea1 + alea2
##
## Df Deviance AIC
## - alea1 1 541.45 559.45
## - serum 1 541.80 559.80
## - alea2 1 542.10 560.10
## <none> 541.06 561.06
## - age 1 545.03 563.03
## - pedigree 1 546.37 564.37
## - pregnant 1 547.40 565.40
## - diastolic 1 547.95 565.95
## - bodymass 1 578.96 596.96
## - plasma 1 618.07 636.07
##
## Step: AIC=559.45
## diabete ~ pregnant + plasma + diastolic + serum + bodymass +
## pedigree + age + alea2
##
## Df Deviance AIC
## - serum 1 542.20 558.20
## - alea2 1 542.56 558.56
## <none> 541.45 559.45
## - age 1 545.46 561.46
## - pedigree 1 546.91 562.91
## - pregnant 1 547.68 563.68
## - diastolic 1 548.49 564.49
## - bodymass 1 579.80 595.80
## - plasma 1 618.56 634.56
##
## Step: AIC=558.2
## diabete ~ pregnant + plasma + diastolic + bodymass + pedigree +
## age + alea2
##
## Df Deviance AIC
## - alea2 1 543.50 557.50
## <none> 542.20 558.20
## - age 1 546.54 560.54
## - pedigree 1 547.24 561.24
## - pregnant 1 548.61 562.61
## - diastolic 1 549.34 563.34
## - bodymass 1 579.80 593.80
## - plasma 1 622.73 636.73
##
## Step: AIC=557.5
## diabete ~ pregnant + plasma + diastolic + bodymass + pedigree +
## age
##
## Df Deviance AIC
## <none> 543.50 557.50
## - age 1 547.93 559.93
## - pedigree 1 548.53 560.53
## - pregnant 1 549.99 561.99
## - diastolic 1 550.82 562.82
## - bodymass 1 581.16 593.16
## - plasma 1 624.35 636.35
#caractéristiques du modèle final
summary(modele.backward_AIC)
##
## Call:
## glm(formula = diabete ~ pregnant + plasma + diastolic + bodymass +
## pedigree + age, family = binomial, data = donnees.app)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5974 -0.7343 -0.4326 0.7434 2.7641
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.980598 0.819010 -9.744 < 2e-16 ***
## pregnant 0.095927 0.038057 2.521 0.01172 *
## plasma 0.030872 0.003829 8.062 7.49e-16 ***
## diastolic -0.016339 0.006154 -2.655 0.00794 **
## bodymass 0.094578 0.016654 5.679 1.35e-08 ***
## pedigree 0.773523 0.349523 2.213 0.02689 *
## age 0.023376 0.011110 2.104 0.03538 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 731.97 on 567 degrees of freedom
## Residual deviance: 543.50 on 561 degrees of freedom
## AIC: 557.5
##
## Number of Fisher Scoring iterations: 5
# Q.16 - performances prédictives du modèle définitif
calculer_taux_erreur(modele.backward_AIC,donnees.test)
## Matrice de confusion :
## prediction
## negative positive
## negative 112 16
## positive 25 47
##
## Taux d'erreur : 0.205
Sélection de variables “backward” - Critère BIC
#selection backward - BIC - cf. le paramètre "k"
modele.backward_BIC <- stepAIC(modele, data = donnees.app, direction="backward", k = log(nrow(donnees.app)))
## Start: AIC=610.68
## diabete ~ pregnant + plasma + diastolic + triceps + serum + bodymass +
## pedigree + age + alea1 + alea2
##
## Df Deviance AIC
## - triceps 1 541.06 604.48
## - alea1 1 541.28 604.70
## - serum 1 541.80 605.22
## - alea2 1 541.94 605.36
## - age 1 545.01 608.44
## - pedigree 1 546.01 609.43
## - pregnant 1 547.13 610.55
## <none> 540.92 610.68
## - diastolic 1 547.94 611.36
## - bodymass 1 573.95 637.38
## - plasma 1 617.52 680.94
##
## Step: AIC=604.48
## diabete ~ pregnant + plasma + diastolic + serum + bodymass +
## pedigree + age + alea1 + alea2
##
## Df Deviance AIC
## - alea1 1 541.45 598.53
## - serum 1 541.80 598.88
## - alea2 1 542.10 599.17
## - age 1 545.03 602.11
## - pedigree 1 546.37 603.45
## - pregnant 1 547.40 604.48
## <none> 541.06 604.48
## - diastolic 1 547.95 605.03
## - bodymass 1 578.96 636.04
## - plasma 1 618.07 675.15
##
## Step: AIC=598.53
## diabete ~ pregnant + plasma + diastolic + serum + bodymass +
## pedigree + age + alea2
##
## Df Deviance AIC
## - serum 1 542.20 592.94
## - alea2 1 542.56 593.30
## - age 1 545.46 596.20
## - pedigree 1 546.91 597.65
## - pregnant 1 547.68 598.42
## <none> 541.45 598.53
## - diastolic 1 548.49 599.22
## - bodymass 1 579.80 630.54
## - plasma 1 618.56 669.30
##
## Step: AIC=592.94
## diabete ~ pregnant + plasma + diastolic + bodymass + pedigree +
## age + alea2
##
## Df Deviance AIC
## - alea2 1 543.50 587.90
## - age 1 546.54 590.94
## - pedigree 1 547.24 591.63
## <none> 542.20 592.94
## - pregnant 1 548.61 593.00
## - diastolic 1 549.34 593.73
## - bodymass 1 579.80 624.20
## - plasma 1 622.73 667.12
##
## Step: AIC=587.9
## diabete ~ pregnant + plasma + diastolic + bodymass + pedigree +
## age
##
## Df Deviance AIC
## - age 1 547.93 585.98
## - pedigree 1 548.53 586.58
## <none> 543.50 587.90
## - pregnant 1 549.99 588.05
## - diastolic 1 550.82 588.87
## - bodymass 1 581.16 619.21
## - plasma 1 624.35 662.41
##
## Step: AIC=585.98
## diabete ~ pregnant + plasma + diastolic + bodymass + pedigree
##
## Df Deviance AIC
## - pedigree 1 553.22 584.93
## - diastolic 1 553.51 585.22
## <none> 547.93 585.98
## - pregnant 1 567.89 599.60
## - bodymass 1 582.52 614.24
## - plasma 1 640.54 672.26
##
## Step: AIC=584.93
## diabete ~ pregnant + plasma + diastolic + bodymass
##
## Df Deviance AIC
## - diastolic 1 558.42 583.79
## <none> 553.22 584.93
## - pregnant 1 571.74 597.11
## - bodymass 1 588.67 614.04
## - plasma 1 649.73 675.10
##
## Step: AIC=583.79
## diabete ~ pregnant + plasma + bodymass
##
## Df Deviance AIC
## <none> 558.42 583.79
## - pregnant 1 574.53 593.56
## - bodymass 1 590.69 609.72
## - plasma 1 652.06 671.09
#carcatéristiques du modèle définitif
summary(modele.backward_BIC)
##
## Call:
## glm(formula = diabete ~ pregnant + plasma + bodymass, family = binomial,
## data = donnees.app)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1344 -0.7399 -0.4508 0.7968 2.7796
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.879901 0.727777 -10.827 < 2e-16 ***
## pregnant 0.122258 0.030885 3.958 7.54e-05 ***
## plasma 0.032016 0.003716 8.617 < 2e-16 ***
## bodymass 0.084730 0.015931 5.319 1.05e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 731.97 on 567 degrees of freedom
## Residual deviance: 558.42 on 564 degrees of freedom
## AIC: 566.42
##
## Number of Fisher Scoring iterations: 5
# Q.18 - performances en test
calculer_taux_erreur(modele.backward_BIC,donnees.test)
## Matrice de confusion :
## prediction
## negative positive
## negative 115 13
## positive 28 44
##
## Taux d'erreur : 0.205
# Q.19 - propriétés de l'objet issu de la sélection
print(attributes(modele.backward_BIC))
## $names
## [1] "coefficients" "residuals" "fitted.values"
## [4] "effects" "R" "rank"
## [7] "qr" "family" "linear.predictors"
## [10] "deviance" "aic" "null.deviance"
## [13] "iter" "weights" "prior.weights"
## [16] "df.residual" "df.null" "y"
## [19] "converged" "boundary" "model"
## [22] "call" "formula" "terms"
## [25] "data" "offset" "control"
## [28] "method" "contrasts" "xlevels"
## [31] "anova"
##
## $class
## [1] "glm" "lm"
# Q.20 + 21 - le champ ANOVA
print(modele.backward_BIC$anova)
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## diabete ~ pregnant + plasma + diastolic + triceps + serum + bodymass +
## pedigree + age + alea1 + alea2
##
## Final Model:
## diabete ~ pregnant + plasma + bodymass
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 557 540.9170 610.6804
## 2 - triceps 1 0.1430769 558 541.0601 604.4813
## 3 - alea1 1 0.3915816 559 541.4517 598.5308
## 4 - serum 1 0.7517023 560 542.2034 592.9404
## 5 - alea2 1 1.2977440 561 543.5011 587.8960
## 6 - age 1 4.4277935 562 547.9289 585.9817
## 7 - pedigree 1 5.2896430 563 553.2186 584.9292
## 8 - diastolic 1 5.2006549 564 558.4192 583.7877
# Q.22 - graphique, évolution du critère BIC
# en fonction du nombre de variables retirées
plot(0:(nrow(modele.backward_BIC$anova)-1),modele.backward_BIC$anova[,"AIC"],type="b",xlab="# de var. retirées",ylab="BIC",main="Sélection backward (BIC)")
Sélection “forward” - Critère BIC
# Q.23 - sélection forward, critère BIC
#modèle initial : modèle réduit à la constante
modele.initial <- glm(diabete ~ 1, data = donnees.app, family = binomial)
print(modele.initial)
##
## Call: glm(formula = diabete ~ 1, family = binomial, data = donnees.app)
##
## Coefficients:
## (Intercept)
## -0.6408
##
## Degrees of Freedom: 567 Total (i.e. Null); 567 Residual
## Null Deviance: 732
## Residual Deviance: 732 AIC: 734
# sélection forward - k = critère BIC
# cf. le rôle du paramètre "scope"
modele.forward <- stepAIC(modele.initial, data = donnees.app, scope = list(lower="~1",upper=as.formula(modele)),direction="forward",k=log(nrow(donnees.app)))
## Start: AIC=738.31
## diabete ~ 1
##
## Df Deviance AIC
## + plasma 1 604.67 617.35
## + bodymass 1 677.00 689.69
## + age 1 701.29 713.98
## + pregnant 1 707.58 720.26
## + serum 1 721.15 733.84
## + pedigree 1 722.67 735.35
## <none> 731.97 738.31
## + triceps 1 727.35 740.03
## + alea2 1 729.48 742.16
## + alea1 1 730.57 743.26
## + diastolic 1 731.47 744.15
##
## Step: AIC=617.35
## diabete ~ plasma
##
## Df Deviance AIC
## + bodymass 1 574.53 593.56
## + pregnant 1 590.69 609.72
## + age 1 595.92 614.94
## <none> 604.67 617.35
## + pedigree 1 600.12 619.14
## + triceps 1 602.17 621.20
## + alea2 1 602.88 621.91
## + alea1 1 603.63 622.66
## + diastolic 1 603.68 622.71
## + serum 1 604.52 623.55
##
## Step: AIC=593.56
## diabete ~ plasma + bodymass
##
## Df Deviance AIC
## + pregnant 1 558.42 583.79
## + age 1 560.88 586.25
## <none> 574.53 593.56
## + pedigree 1 570.87 596.24
## + diastolic 1 571.74 597.11
## + alea2 1 572.49 597.86
## + serum 1 572.94 598.31
## + alea1 1 573.92 599.29
## + triceps 1 574.30 599.67
##
## Step: AIC=583.79
## diabete ~ plasma + bodymass + pregnant
##
## Df Deviance AIC
## <none> 558.42 583.79
## + diastolic 1 553.22 584.93
## + pedigree 1 553.51 585.22
## + age 1 555.47 587.18
## + alea2 1 556.93 588.64
## + alea1 1 557.58 589.29
## + serum 1 557.63 589.35
## + triceps 1 558.33 590.04
#modèle définitif
summary(modele.forward)
##
## Call:
## glm(formula = diabete ~ plasma + bodymass + pregnant, family = binomial,
## data = donnees.app)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1344 -0.7399 -0.4508 0.7968 2.7796
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.879901 0.727777 -10.827 < 2e-16 ***
## plasma 0.032016 0.003716 8.617 < 2e-16 ***
## bodymass 0.084730 0.015931 5.319 1.05e-07 ***
## pregnant 0.122258 0.030885 3.958 7.54e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 731.97 on 567 degrees of freedom
## Residual deviance: 558.42 on 564 degrees of freedom
## AIC: 566.42
##
## Number of Fisher Scoring iterations: 5
# Q.24 - ordre de sélection des variables
# accès de nouveau au champ ANOVA
print(modele.forward$anova)
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## diabete ~ 1
##
## Final Model:
## diabete ~ plasma + bodymass + pregnant
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 567 731.9720 738.3141
## 2 + plasma 1 127.30197 566 604.6700 617.3542
## 3 + bodymass 1 30.13784 565 574.5321 593.5585
## 4 + pregnant 1 16.11291 564 558.4192 583.7877
# graphique : évolution du critère
# en fonction du nombre de variables introduites
plot(0:(nrow(modele.forward$anova)-1),modele.forward$anova[,"AIC"],type="b",xlab="# de var. introdutes",ylab="BIC",main="Sélection forward (BIC)")