# Q.0 + 1
# library
library(xlsx)
# load the data
donnees <- read.xlsx("Low_Birth_Weight_Data.xlsx",sheetIndex = 1, header=TRUE, stringsAsFactors=TRUE)
# first rows
print(head(donnees))
## LOW AGE LWT SMOKE HT UI FTV PTL
## 1 y 28 120 1 0 1 0 1
## 2 y 29 130 0 0 1 1 0
## 3 y 34 187 1 1 0 0 0
## 4 y 25 105 0 1 0 0 1
## 5 y 25 85 0 0 1 0 0
## 6 y 27 150 0 0 0 0 0
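If the Java-based xlsx package is not available, the same file can be loaded with readxl instead; a minimal sketch, assuming the same file layout (read_excel returns a tibble with character columns, hence the explicit conversions):
# alternative loading via readxl (no Java dependency, returns a tibble without factors)
library(readxl)
donnees.alt <- as.data.frame(read_excel("Low_Birth_Weight_Data.xlsx", sheet = 1))
donnees.alt$LOW <- factor(donnees.alt$LOW)  # glm() needs a factor (or 0/1) response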
# Q.1 - description of the dataset
str(donnees)
## 'data.frame': 189 obs. of 8 variables:
## $ LOW : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 2 2 2 2 ...
## $ AGE : num 28 29 34 25 25 27 23 24 24 21 ...
## $ LWT : num 120 130 187 105 85 150 97 124 132 165 ...
## $ SMOKE: num 1 0 1 0 0 0 0 0 0 1 ...
## $ HT : num 0 0 1 1 0 0 0 0 1 1 ...
## $ UI : num 1 1 0 0 1 0 1 0 0 0 ...
## $ FTV : num 0 1 0 0 0 0 1 1 0 1 ...
## $ PTL : num 1 0 0 1 0 0 0 1 0 0 ...
# Q.2
# model: LOW vs. all explanatory variables
modele <- glm(LOW ~ ., data = donnees, family = binomial)
print(attributes(modele))
## $names
## [1] "coefficients" "residuals" "fitted.values"
## [4] "effects" "R" "rank"
## [7] "qr" "family" "linear.predictors"
## [10] "deviance" "aic" "null.deviance"
## [13] "iter" "weights" "prior.weights"
## [16] "df.residual" "df.null" "y"
## [19] "converged" "boundary" "model"
## [22] "call" "formula" "terms"
## [25] "data" "offset" "control"
## [28] "method" "contrasts" "xlevels"
##
## $class
## [1] "glm" "lm"
# likelihood ratio test
# test statistic
chi2 <- modele$null.deviance - modele$deviance
print(chi2)
## [1] 31.89948
# degrees of freedom (ddl)
ddl <- modele$df.null - modele$df.residual
print(ddl)
## [1] 7
# p-value < 0.05: the model is globally significant
print(pchisq(chi2,ddl,lower.tail=FALSE))
## [1] 4.239618e-05
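The same global test can be cross-checked with base R's anova() on nested models; a minimal sketch reusing the objects above (modele.nul is an assumed name):
# sketch: likelihood ratio test via anova() against the intercept-only model
modele.nul <- glm(LOW ~ 1, data = donnees, family = binomial)
print(anova(modele.nul, modele, test = "Chisq"))
# the "Deviance" column should reproduce chi2, and "Pr(>Chi)" the p-value above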
# Q.3 - detailed results - 3 (variable) coefficients show a p-value < 0.05
print(summary(modele))
##
## Call:
## glm(formula = LOW ~ ., family = binomial, data = donnees)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8223 -0.7914 -0.5983 0.9523 2.0628
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.491472 1.115765 1.337 0.18131
## AGE -0.046676 0.036941 -1.264 0.20639
## LWT -0.013955 0.006766 -2.063 0.03915 *
## SMOKE 0.446009 0.355204 1.256 0.20924
## HT 1.832962 0.699416 2.621 0.00877 **
## UI 0.668032 0.464022 1.440 0.14996
## FTV -0.255064 0.364908 -0.699 0.48456
## PTL 1.336773 0.458652 2.915 0.00356 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 234.67 on 188 degrees of freedom
## Residual deviance: 202.77 on 181 degrees of freedom
## AIC: 218.77
##
## Number of Fisher Scoring iterations: 4
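The significant rows can be extracted programmatically from the coefficient matrix returned by summary(); a small sketch (coefs is an assumed name):
# sketch: keep the rows of the coefficient table with p-value < 0.05
coefs <- summary(modele)$coefficients
print(coefs[coefs[, "Pr(>|z|)"] < 0.05, ])  # should list LWT, HT and PTL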
# Q.4 - assignment probabilities - membership scores - for LOW = y
# by resubstitution, i.e. computed on the data used to fit the model
scores <- predict(modele, newdata = donnees, type = "response")
print(summary(scores))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03686 0.18203 0.26000 0.31217 0.40345 0.88381
# we could also have retrieved $fitted.values
print(summary(modele$fitted.values))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03686 0.18203 0.26000 0.31217 0.40345 0.88381
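A quick sanity check that the two vectors coincide:
# sketch: predict(type = "response") on the training data equals $fitted.values
print(all.equal(as.numeric(scores), as.numeric(modele$fitted.values)))  # expect TRUE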
# Q.6 - boxplot of the scores
boxplot(scores)
# Q.7 - display the boxplots by class
boxplot(scores ~ donnees$LOW)
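As a numeric complement to the boxplots, the scores can be summarized per class; a short sketch:
# sketch: per-class summary of the scores, expected to be higher for LOW = "y"
print(tapply(scores, donnees$LOW, summary))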
Check whether the modeled scores are consistent with the observed scores (the proportion of observations belonging to the target class) when the data are organized into groups according to the scores.
# Q.8 + 9 - binning into groups
# equal-width groups, cut points = 0.0, 0.25, 0.5, 0.75, 1.0
g1 <- cut(scores,breaks=seq(from=0.0, to=1.0, by=0.25),include.lowest=TRUE)
print(table(g1))
## g1
## [0,0.25] (0.25,0.5] (0.5,0.75] (0.75,1]
## 91 64 28 6
# Q.10 - mean of the probabilities (scores) per group
moy.scores <- tapply(scores,g1,mean)
print(moy.scores)
## [0,0.25] (0.25,0.5] (0.5,0.75] (0.75,1]
## 0.1650565 0.3416387 0.6147689 0.8169080
# Q.12 + 13 - proportion of positives per group
prop.pos <- apply(table(g1,donnees$LOW),1,function(x){x[2]/sum(x)})
print(prop.pos)
## [0,0.25] (0.25,0.5] (0.5,0.75] (0.75,1]
## 0.1538462 0.3906250 0.5357143 0.8333333
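The same proportions can be obtained more directly by averaging a 0/1 indicator of the positive class within each group; a short sketch:
# sketch: proportion of positives per group via the indicator LOW == "y"
print(tapply(donnees$LOW == "y", g1, mean))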
# reliability diagram
# the points form (roughly) a straight line -> well-calibrated model
# i.e. it produces scores that agree with the observed class proportions
# the idea is to contrast modeled scores vs. observed scores
plot(moy.scores,prop.pos,main="Reliability diagram",type="b",xlim=c(0,1),ylim=c(0,1))
abline(a=0,b=1)
# Q.14 + 15 + 16 - groups based on the deciles, cf. the use of quantile() in breaks
g2 <- cut(scores,breaks=quantile(scores,probs=seq(0.0,1.0,0.1)),include.lowest=TRUE)
print(table(g2))
## g2
## [0.0369,0.119] (0.119,0.16] (0.16,0.193] (0.193,0.22] (0.22,0.26]
## 19 19 19 19 19
## (0.26,0.296] (0.296,0.362] (0.362,0.469] (0.469,0.609] (0.609,0.884]
## 18 19 19 19 19
# Q.17 - sum of the scores per group
sum.pos.scores <- tapply(scores,g2,sum)
print(sum.pos.scores)
## [0.0369,0.119] (0.119,0.16] (0.16,0.193] (0.193,0.22] (0.22,0.26]
## 1.617340 2.592616 3.366639 3.926826 4.546942
## (0.26,0.296] (0.296,0.362] (0.362,0.469] (0.469,0.609] (0.609,0.884]
## 4.976966 6.144608 7.769706 10.385999 13.672356
# Q.18 - complement = n_g - sum of the positive scores
# i.e. the sum of the negative scores
sum.neg.scores <- table(g2)-sum.pos.scores
print(sum.neg.scores)
## g2
## [0.0369,0.119] (0.119,0.16] (0.16,0.193] (0.193,0.22] (0.22,0.26]
## 17.382660 16.407384 15.633361 15.073174 14.453058
## (0.26,0.296] (0.296,0.362] (0.362,0.469] (0.469,0.609] (0.609,0.884]
## 13.023034 12.855392 11.230294 8.614001 5.327644
# alternatively: 1-scores = prob. of belonging to the class "LOW = n"
print(tapply(1-scores,g2,sum))
## [0.0369,0.119] (0.119,0.16] (0.16,0.193] (0.193,0.22] (0.22,0.26]
## 17.382660 16.407384 15.633361 15.073174 14.453058
## (0.26,0.296] (0.296,0.362] (0.362,0.469] (0.469,0.609] (0.609,0.884]
## 13.023034 12.855392 11.230294 8.614001 5.327644
# Q.19 - negative/positive counts per group
effectifs.neg.pos <- table(g2,donnees$LOW)
print(effectifs.neg.pos)
##
## g2 n y
## [0.0369,0.119] 19 0
## (0.119,0.16] 15 4
## (0.16,0.193] 18 1
## (0.193,0.22] 15 4
## (0.22,0.26] 12 7
## (0.26,0.296] 12 6
## (0.296,0.362] 14 5
## (0.362,0.469] 10 9
## (0.469,0.609] 9 10
## (0.609,0.884] 6 13
# Q.20 - Hosmer & Lemeshow statistic
# for the negatives
C1 <- sum((effectifs.neg.pos[,1]-sum.neg.scores)^2/sum.neg.scores)
# for the positives
C2 <- sum((effectifs.neg.pos[,2]-sum.pos.scores)^2/sum.pos.scores)
# Hosmer & Lemeshow statistic
HL <- C1 + C2
print(HL)
## [1] 7.50089
# and p-value [chi-squared distribution with (10 - 2) degrees of freedom]
# p-value > 0.05: the computed scores are consistent with the observed class proportions
# the model seems adequate
print(pchisq(HL,10-2,lower.tail=FALSE))
## [1] 0.4836754
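The steps of Q.14 to Q.20 can be bundled into a reusable helper; a minimal sketch (hosmer.lemeshow and its arguments are assumed names, not from any package):
# sketch of a Hosmer & Lemeshow computation following the manual steps above
hosmer.lemeshow <- function(y, p, g = 10){
  # y: 0/1 vector (1 = positive class), p: predicted scores, g: number of groups
  grp <- cut(p, breaks = quantile(p, probs = seq(0, 1, 1/g)), include.lowest = TRUE)
  exp.pos <- tapply(p, grp, sum)      # expected positives per group
  exp.neg <- tapply(1 - p, grp, sum)  # expected negatives per group
  obs.pos <- tapply(y, grp, sum)      # observed positives per group
  obs.neg <- table(grp) - obs.pos     # observed negatives per group
  HL <- sum((obs.pos - exp.pos)^2 / exp.pos) + sum((obs.neg - exp.neg)^2 / exp.neg)
  list(statistic = HL, df = g - 2, p.value = pchisq(HL, g - 2, lower.tail = FALSE))
}
print(hosmer.lemeshow(ifelse(donnees$LOW == "y", 1, 0), scores))
# should reproduce HL = 7.50089 and the p-value 0.4836754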
# Q.22 - dedicated package
library(generalhoslem)
## Loading required package: reshape
## Loading required package: MASS
# call the function
print(generalhoslem::logitgof(donnees$LOW,scores))
##
## Hosmer and Lemeshow test (binary model)
##
## data: donnees$LOW, scores
## X-squared = 7.5009, df = 8, p-value = 0.4837
# Q.23 - another package
library(ResourceSelection)
## ResourceSelection 0.3-5 2019-07-22
# call the function
ResourceSelection::hoslem.test(unclass(donnees$LOW)-1,scores)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: unclass(donnees$LOW) - 1, scores
## X-squared = 7.5009, df = 8, p-value = 0.4837
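The unclass() trick relies on the level order of the factor ("n" = 1, "y" = 2); an explicit recoding is equivalent and more readable; a short sketch:
# sketch: explicit 0/1 recoding of the target before calling hoslem.test
y01 <- ifelse(donnees$LOW == "y", 1, 0)
ResourceSelection::hoslem.test(y01, scores)  # same X-squared and p-value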