Importation, modélisation, scores

# Q.0 + 1
# package used to read Excel workbooks
library(xlsx)

# import the dataset: first worksheet, header row kept,
# character columns turned into factors (LOW becomes a factor n/y)
donnees <- read.xlsx(
  "Low_Birth_Weight_Data.xlsx",
  sheetIndex = 1,
  header = TRUE,
  stringsAsFactors = TRUE
)

# show the first rows to check the import
print(head(donnees))
##   LOW AGE LWT SMOKE HT UI FTV PTL
## 1   y  28 120     1  0  1   0   1
## 2   y  29 130     0  0  1   1   0
## 3   y  34 187     1  1  0   0   0
## 4   y  25 105     0  1  0   0   1
## 5   y  25  85     0  0  1   0   0
## 6   y  27 150     0  0  0   0   0
# Q.1 - description of the data frame: 189 observations, 8 variables
# LOW is the target (factor with levels "n"/"y" = low birth weight);
# the 7 other columns are numeric predictors (AGE, LWT, and 0/1 indicators)
str(donnees)
## 'data.frame':    189 obs. of  8 variables:
##  $ LOW  : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ AGE  : num  28 29 34 25 25 27 23 24 24 21 ...
##  $ LWT  : num  120 130 187 105 85 150 97 124 132 165 ...
##  $ SMOKE: num  1 0 1 0 0 0 0 0 0 1 ...
##  $ HT   : num  0 0 1 1 0 0 0 0 1 1 ...
##  $ UI   : num  1 1 0 0 1 0 1 0 0 0 ...
##  $ FTV  : num  0 1 0 0 0 0 1 1 0 1 ...
##  $ PTL  : num  1 0 0 1 0 0 0 1 0 0 ...
# Q.2
# logistic regression: LOW explained by all the other variables
# (match.call() normalizes argument order, so the stored call is unchanged)
modele <- glm(LOW ~ ., family = binomial, data = donnees)
print(attributes(modele))
## $names
##  [1] "coefficients"      "residuals"         "fitted.values"    
##  [4] "effects"           "R"                 "rank"             
##  [7] "qr"                "family"            "linear.predictors"
## [10] "deviance"          "aic"               "null.deviance"    
## [13] "iter"              "weights"           "prior.weights"    
## [16] "df.residual"       "df.null"           "y"                
## [19] "converged"         "boundary"          "model"            
## [22] "call"              "formula"           "terms"            
## [25] "data"              "offset"            "control"          
## [28] "method"            "contrasts"         "xlevels"          
## 
## $class
## [1] "glm" "lm"
# likelihood-ratio test of the whole model
# test statistic = drop in deviance between the null and the fitted model
chi2 <- with(modele, null.deviance - deviance)
print(chi2)
## [1] 31.89948
# degrees of freedom = number of estimated slope coefficients
ddl <- with(modele, df.null - df.residual)
print(ddl)
## [1] 7
# p-value < 0.05 => the model is globally significant
print(pchisq(chi2, df = ddl, lower.tail = FALSE))
## [1] 4.239618e-05
# Q.3 - detailed results: 3 predictor coefficients have a p-value < 0.05
# (LWT, HT and PTL, per the coefficient table below)
print(summary(modele))
## 
## Call:
## glm(formula = LOW ~ ., family = binomial, data = donnees)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8223  -0.7914  -0.5983   0.9523   2.0628  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  1.491472   1.115765   1.337  0.18131   
## AGE         -0.046676   0.036941  -1.264  0.20639   
## LWT         -0.013955   0.006766  -2.063  0.03915 * 
## SMOKE        0.446009   0.355204   1.256  0.20924   
## HT           1.832962   0.699416   2.621  0.00877 **
## UI           0.668032   0.464022   1.440  0.14996   
## FTV         -0.255064   0.364908  -0.699  0.48456   
## PTL          1.336773   0.458652   2.915  0.00356 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 234.67  on 188  degrees of freedom
## Residual deviance: 202.77  on 181  degrees of freedom
## AIC: 218.77
## 
## Number of Fisher Scoring iterations: 4
# Q.4 - membership probabilities (scores) for the target class LOW = "y",
# computed by resubstitution, i.e. on the same data used to fit the model
scores <- predict(modele, newdata = donnees, type = "response")
print(summary(scores))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.03686 0.18203 0.26000 0.31217 0.40345 0.88381
# equivalently, the fitted values stored in the model give the same numbers
print(summary(fitted(modele)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.03686 0.18203 0.26000 0.31217 0.40345 0.88381
# Q.6 - overall boxplot of the scores
boxplot(scores)

# Q.7 - one boxplot of the scores per class of LOW
boxplot(scores ~ donnees$LOW)

Diagramme de fiabilité

Vérifier si les scores modélisés sont compatibles avec les scores observés (proportion des observations appartenant à la modalité cible) lorsque les données sont organisées en groupes en fonction des scores.

# Q.8 + 9 - binning of the scores
# four equal-width groups, cut points = 0.0, 0.25, 0.5, 0.75, 1.0
# (use TRUE, not the reassignable shortcut T)
g1 <- cut(scores, breaks = seq(from = 0.0, to = 1.0, by = 0.25), include.lowest = TRUE)
print(table(g1))
## g1
##   [0,0.25] (0.25,0.5] (0.5,0.75]   (0.75,1] 
##         91         64         28          6
# Q.10 - mean of the scores within each group (predicted risk per bin)
moy.scores <- tapply(scores, g1, mean)
print(moy.scores)
##   [0,0.25] (0.25,0.5] (0.5,0.75]   (0.75,1] 
##  0.1650565  0.3416387  0.6147689  0.8169080
# Q.12 + 13 - observed proportion of positives (LOW = "y") in each group
prop.pos <- apply(table(g1, donnees$LOW), 1, function(x) { x[2] / sum(x) })
print(prop.pos)
##   [0,0.25] (0.25,0.5] (0.5,0.75]   (0.75,1] 
##  0.1538462  0.3906250  0.5357143  0.8333333
# reliability diagram
# the points lie (roughly) on the diagonal -> the model is well calibrated,
# i.e. the predicted scores agree with the observed class proportions
# idea: predicted scores (x axis) vs. observed scores (y axis)
plot(moy.scores, prop.pos, main = "Diagramme de fiabilité", type = "b", xlim = c(0, 1), ylim = c(0, 1))
abline(a = 0, b = 1)

Test de Hosmer et Lemeshow

# Q.14 + 15 + 16 - decile-based groups: the breaks come from quantile()
g2 <- cut(scores, breaks = quantile(scores, probs = seq(0.0, 1.0, 0.1)), include.lowest = TRUE)
print(table(g2))
## g2
## [0.0369,0.119]   (0.119,0.16]   (0.16,0.193]   (0.193,0.22]    (0.22,0.26] 
##             19             19             19             19             19 
##   (0.26,0.296]  (0.296,0.362]  (0.362,0.469]  (0.469,0.609]  (0.609,0.884] 
##             18             19             19             19             19
# Q.17 - expected number of positives per group = sum of the scores
sum.pos.scores <- tapply(scores, g2, sum)
print(sum.pos.scores)
## [0.0369,0.119]   (0.119,0.16]   (0.16,0.193]   (0.193,0.22]    (0.22,0.26] 
##       1.617340       2.592616       3.366639       3.926826       4.546942 
##   (0.26,0.296]  (0.296,0.362]  (0.362,0.469]  (0.469,0.609]  (0.609,0.884] 
##       4.976966       6.144608       7.769706      10.385999      13.672356
# Q.18 - complement: expected negatives = group size minus the positive sums
sum.neg.scores <- table(g2) - sum.pos.scores
print(sum.neg.scores)
## g2
## [0.0369,0.119]   (0.119,0.16]   (0.16,0.193]   (0.193,0.22]    (0.22,0.26] 
##      17.382660      16.407384      15.633361      15.073174      14.453058 
##   (0.26,0.296]  (0.296,0.362]  (0.362,0.469]  (0.469,0.609]  (0.609,0.884] 
##      13.023034      12.855392      11.230294       8.614001       5.327644
# same thing directly: 1 - scores = probability of belonging to "LOW = n"
print(tapply(1 - scores, g2, sum))
## [0.0369,0.119]   (0.119,0.16]   (0.16,0.193]   (0.193,0.22]    (0.22,0.26] 
##      17.382660      16.407384      15.633361      15.073174      14.453058 
##   (0.26,0.296]  (0.296,0.362]  (0.362,0.469]  (0.469,0.609]  (0.609,0.884] 
##      13.023034      12.855392      11.230294       8.614001       5.327644
# Q.19 - observed counts of negatives ("n") and positives ("y") per decile group
effectifs.neg.pos <- table(g2,donnees$LOW)
print(effectifs.neg.pos)
##                 
## g2                n  y
##   [0.0369,0.119] 19  0
##   (0.119,0.16]   15  4
##   (0.16,0.193]   18  1
##   (0.193,0.22]   15  4
##   (0.22,0.26]    12  7
##   (0.26,0.296]   12  6
##   (0.296,0.362]  14  5
##   (0.362,0.469]  10  9
##   (0.469,0.609]   9 10
##   (0.609,0.884]   6 13
# Q.20 - Hosmer & Lemeshow statistic
# compares observed counts with expected counts (sums of scores) in each group

# chi-square contribution of the negatives (column "n")
C1 <- sum((effectifs.neg.pos[,1]-sum.neg.scores)^2/sum.neg.scores)

# chi-square contribution of the positives (column "y")
C2 <- sum((effectifs.neg.pos[,2]-sum.pos.scores)^2/sum.pos.scores)

# Hosmer & Lemeshow statistic
HL <- C1 + C2
print(HL)
## [1] 7.50089
# p-value from a chi-square with (G - 2) degrees of freedom,
# where G = number of groups (10 deciles here, hence df = 8);
# nlevels(g2) keeps the df correct if the binning changes
# p-value > 0.05: the computed scores are compatible with the observed
# class proportions -> the model seems adequate
print(pchisq(HL, nlevels(g2) - 2, lower.tail = FALSE))
## [1] 0.4836754
# Q.22 - dedicated package: cross-check the hand-computed HL test
library(generalhoslem)
## Le chargement a nécessité le package : reshape
## Le chargement a nécessité le package : MASS
# call the function (same statistic and p-value as the manual computation)
print(generalhoslem::logitgof(donnees$LOW,scores))
## 
##  Hosmer and Lemeshow test (binary model)
## 
## data:  donnees$LOW, scores
## X-squared = 7.5009, df = 8, p-value = 0.4837
# Q.23 - another dedicated package
library(ResourceSelection)
## ResourceSelection 0.3-5   2019-07-22
# call the function; LOW is recoded to 0/1 (n -> 0, y -> 1) because
# hoslem.test expects a numeric response; wrapped in print() for
# consistency with the logitgof call and so it shows under source()
print(ResourceSelection::hoslem.test(unclass(donnees$LOW)-1,scores))
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  unclass(donnees$LOW) - 1, scores
## X-squared = 7.5009, df = 8, p-value = 0.4837