Régression sur données individuelles
# Q.0 + Q.1 - importer les données orginales
# raw dataset
library(xlsx)
DRaw <- read.xlsx("titanic.xlsx",header=TRUE,sheetName="RAW",stringsAsFactors=TRUE,encoding="UTF-8")
# afficher les premières lignes
print(head(DRaw))
## SURVIVANT CLASSE_ AGE_ SEXE_
## 1 YES 1ST ADULT MALE
## 2 YES 1ST ADULT MALE
## 3 YES 1ST ADULT MALE
## 4 YES 1ST ADULT MALE
## 5 YES 1ST ADULT MALE
## 6 YES 1ST ADULT MALE
# caractéristiques de la base
str(DRaw)
## 'data.frame': 2201 obs. of 4 variables:
## $ SURVIVANT: Factor w/ 2 levels "NO","YES": 2 2 2 2 2 2 2 2 2 2 ...
## $ CLASSE_ : Factor w/ 4 levels "1ST","2ND","3RD",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ AGE_ : Factor w/ 2 levels "ADULT","CHILD": 1 1 1 1 1 1 1 1 1 1 ...
## $ SEXE_ : Factor w/ 2 levels "FEMALE","MALE": 2 2 2 2 2 2 2 2 2 2 ...
# Q.2 - distribution par variable
# Attention, marche uniquement parce que colonnes
# encodées en type "factor" de R
# (cf. option stringsAsFactors de read.xlsx)
print(summary(DRaw))
## SURVIVANT CLASSE_ AGE_ SEXE_
## NO :1490 1ST :325 ADULT:2092 FEMALE: 470
## YES: 711 2ND :285 CHILD: 109 MALE :1731
## 3RD :706
## CREW:885
# Q.3 + Q.4 - régression logistique avec ces données brutes
# l'interprétation est assez savoureuse (si on peut dire...)
mRaw <- glm(SURVIVANT ~ CLASSE_ + AGE_ + SEXE_, data = DRaw, family = binomial)
print(summary(mRaw))
##
## Call:
## glm(formula = SURVIVANT ~ CLASSE_ + AGE_ + SEXE_, family = binomial,
## data = DRaw)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0812 -0.7149 -0.6656 0.6858 2.1278
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.0438 0.1679 12.171 < 2e-16 ***
## CLASSE_2ND -1.0181 0.1960 -5.194 2.05e-07 ***
## CLASSE_3RD -1.7778 0.1716 -10.362 < 2e-16 ***
## CLASSE_CREW -0.8577 0.1573 -5.451 5.00e-08 ***
## AGE_CHILD 1.0615 0.2440 4.350 1.36e-05 ***
## SEXE_MALE -2.4201 0.1404 -17.236 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2769.5 on 2200 degrees of freedom
## Residual deviance: 2210.1 on 2195 degrees of freedom
## AIC: 2222.1
##
## Number of Fisher Scoring iterations: 4
# Q.5 - nombre de combinaisons théoriques possibles
print(nlevels(DRaw$CLASSE)*nlevels(DRaw$AGE)*nlevels(DRaw$SEXE))
## [1] 16
# Q.6 + Q.7 - calcul des effectifs selon les croisements
# nous constatons que certaines combinaisons n'existent pas
# on note surtout : pas d'enfants (AGE = CHILD)
# qui soient membres d'équipage (CLASSE = CREW)
# que ce soit pour les femmes ou pour les hommes
# => combinaisons (pattern) observées réellement = 14
print(table(DRaw[2:ncol(DRaw)]))
## , , SEXE_ = FEMALE
##
## AGE_
## CLASSE_ ADULT CHILD
## 1ST 144 1
## 2ND 93 13
## 3RD 165 31
## CREW 23 0
##
## , , SEXE_ = MALE
##
## AGE_
## CLASSE_ ADULT CHILD
## 1ST 175 5
## 2ND 168 11
## 3RD 462 48
## CREW 862 0
Régression sur données groupées
# Q.8 - chargement du second dataset
DPat <- read.xlsx("titanic.xlsx",header=TRUE,sheetName="PATTERN",stringsAsFactors=TRUE,encoding="UTF-8")
# afficher la base
print(DPat)
## SURV_YES SURV_NO CLASSE_ AGE_ SEXE_
## 1 140 4 1ST ADULT FEMALE
## 2 57 118 1ST ADULT MALE
## 3 1 0 1ST CHILD FEMALE
## 4 5 0 1ST CHILD MALE
## 5 80 13 2ND ADULT FEMALE
## 6 14 154 2ND ADULT MALE
## 7 13 0 2ND CHILD FEMALE
## 8 11 0 2ND CHILD MALE
## 9 76 89 3RD ADULT FEMALE
## 10 75 387 3RD ADULT MALE
## 11 14 17 3RD CHILD FEMALE
## 12 13 35 3RD CHILD MALE
## 13 20 3 CREW ADULT FEMALE
## 14 192 670 CREW ADULT MALE
# Q.9 - carac. de la base
str(DPat)
## 'data.frame': 14 obs. of 5 variables:
## $ SURV_YES: num 140 57 1 5 80 14 13 11 76 75 ...
## $ SURV_NO : num 4 118 0 0 13 154 0 0 89 387 ...
## $ CLASSE_ : Factor w/ 4 levels "1ST","2ND","3RD",..: 1 1 1 1 2 2 2 2 3 3 ...
## $ AGE_ : Factor w/ 2 levels "ADULT","CHILD": 1 1 2 2 1 1 2 2 1 1 ...
## $ SEXE_ : Factor w/ 2 levels "FEMALE","MALE": 1 2 1 2 1 2 1 2 1 2 ...
# nombre de pattern (profils)
M <- nrow(DPat)
print(M)
## [1] 14
# Q.10 - créer la matrice qui corresponde à la variable cible
response <- as.matrix(DPat[c("SURV_YES","SURV_NO")])
print(response)
## SURV_YES SURV_NO
## [1,] 140 4
## [2,] 57 118
## [3,] 1 0
## [4,] 5 0
## [5,] 80 13
## [6,] 14 154
## [7,] 13 0
## [8,] 11 0
## [9,] 76 89
## [10,] 75 387
## [11,] 14 17
## [12,] 13 35
## [13,] 20 3
## [14,] 192 670
# Q.11
# response est une matrice à 2 colonnes :
# 1ere nb de positifs, 2nde, nb de négatifs
# coefficients estimés et écarts-type identiques que raw data
# MAIS deviance et degrés de liberté différents
mPat <- glm(response ~ CLASSE_ + AGE_ + SEXE_, data = DPat, family = binomial)
print(summary(mPat))
##
## Call:
## glm(formula = response ~ CLASSE_ + AGE_ + SEXE_, family = binomial,
## data = DPat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.1356 -1.7126 0.7812 2.6800 4.3833
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.0438 0.1679 12.171 < 2e-16 ***
## CLASSE_2ND -1.0181 0.1960 -5.194 2.05e-07 ***
## CLASSE_3RD -1.7778 0.1716 -10.362 < 2e-16 ***
## CLASSE_CREW -0.8577 0.1573 -5.451 5.00e-08 ***
## AGE_CHILD 1.0615 0.2440 4.350 1.36e-05 ***
## SEXE_MALE -2.4201 0.1404 -17.236 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 671.96 on 13 degrees of freedom
## Residual deviance: 112.57 on 8 degrees of freedom
## AIC: 171.19
##
## Number of Fisher Scoring iterations: 5
Levier
# Q.12 - calcul du levier
# de chaque profil
h <- hatvalues(mPat)
print(h)
## 1 2 3 4 5 6
## 0.412281480 0.783264508 0.003649587 0.083021759 0.503009111 0.646010794
## 7 8 9 10 11 12
## 0.102851092 0.193077689 0.677369351 0.693218980 0.315911994 0.507235025
## 13 14
## 0.103640654 0.975457975
# Q.13 - vérifier la somme des leviers
print(sum(h))
## [1] 6
# nombre de paramètres (coefficients) estimés dans le modèle
J <- length(mPat$coefficients)
print(J)
## [1] 6
# Q.14 - barplot
barplot(h,horiz=TRUE,las=2,main="Levier",cex.names=0.8)
# les profils correspondants aux 5 valeurs
# de levier les plus élevés
print(DPat[order(h,decreasing = TRUE)[1:5],])
## SURV_YES SURV_NO CLASSE_ AGE_ SEXE_
## 14 192 670 CREW ADULT MALE
## 2 57 118 1ST ADULT MALE
## 10 75 387 3RD ADULT MALE
## 9 76 89 3RD ADULT FEMALE
## 6 14 154 2ND ADULT MALE
# Q.15
# N effectif pour chaque pattern
N <- DPat$SURV_YES + DPat$SURV_NO
print(N)
## [1] 144 175 1 5 93 168 13 11 165 462 31 48 23 862
# correspondance avec les données -- afficher les effectifs
print(data.frame(N=N,DPat$CLASSE,DPat$AGE,DPat$SEXE)[order(h,decreasing = TRUE),])
## N DPat.CLASSE DPat.AGE DPat.SEXE
## 14 862 CREW ADULT MALE
## 2 175 1ST ADULT MALE
## 10 462 3RD ADULT MALE
## 9 165 3RD ADULT FEMALE
## 6 168 2ND ADULT MALE
## 12 48 3RD CHILD MALE
## 5 93 2ND ADULT FEMALE
## 1 144 1ST ADULT FEMALE
## 11 31 3RD CHILD FEMALE
## 8 11 2ND CHILD MALE
## 13 23 CREW ADULT FEMALE
## 7 13 2ND CHILD FEMALE
## 4 5 1ST CHILD MALE
## 3 1 1ST CHILD FEMALE
Résidus de Pearson
# Q.16 - proba d'être positif pour chaque pattern
# fournie par la régression logistique
proba <- predict(mPat,newdata=DPat,type="response")
print(proba)
## 1 2 3 4 5 6 7 8
## 0.8853234 0.4070382 0.9571141 0.6649249 0.7360897 0.1987193 0.8896612 0.4175655
## 9 10 11 12 13 14
## 0.5661291 0.1039594 0.7904463 0.2511586 0.7660538 0.2254997
# Q.17 - calcul du résidus du Pearson
# colonne Y -- nombre de positifs pour chaque pattern
Y <- DPat$SURV_YES
print(Y)
## [1] 140 57 1 5 80 14 13 11 76 75 14 13 20 192
# YHAT, estimation du nombre de positifs pour chaque pattern
# à partir des probabilités fournies par la régression logistique
YHAT <- N * proba
print(YHAT)
## 1 2 3 4 5 6
## 127.4865756 71.2316857 0.9571141 3.3246245 68.4563377 33.3848470
## 7 8 9 10 11 12
## 11.5655953 4.5932200 93.4113049 48.0292490 24.5038347 12.0556113
## 13 14
## 17.6192376 194.3807624
# résidu de Pearson
res_pearson <- (Y - YHAT)/sqrt(N*proba*(1.0-proba))
print(res_pearson)
## 1 2 3 4 5 6 7
## 3.2727032 -2.1898097 0.2116778 1.5873393 2.7158675 -3.7479635 1.2697655
## 8 9 10 11 12 13 14
## 3.9170366 -2.7349623 4.1112729 -4.6353568 0.3143122 1.1726395 -0.1940344
# Q.18 - vérification
# avec la fonction dédiée de R
residuals(mPat,"pearson")
## 1 2 3 4 5 6 7
## 3.2727032 -2.1898097 0.2116778 1.5873393 2.7158675 -3.7479635 1.2697655
## 8 9 10 11 12 13 14
## 3.9170366 -2.7349623 4.1112729 -4.6353568 0.3143122 1.1726395 -0.1940344
# Q.19 - statistique de Pearson
stat_pearson <- sum(res_pearson^2)
print(stat_pearson)
## [1] 103.8296
# proba. critique -- p-value
# attention aux degrés de liberté
print(pchisq(stat_pearson,M-J,lower.tail = FALSE))
## [1] 7.026538e-19
# Q.20 - résidu standardisé de Pearson
rstd_pearson <- rstandard(mPat,type="pearson")
print(rstd_pearson)
## 1 2 3 4 5 6 7
## 4.2689585 -4.7037189 0.2120651 1.6576402 3.8524265 -6.2994167 1.3405755
## 8 9 10 11 12 13 14
## 4.3605549 -4.8150249 7.4227025 -5.6043749 0.4477559 1.2385782 -1.2385782
# Q.21 - contrib. Pearson
contrib_pearson <- rstd_pearson^2
print(contrib_pearson)
## 1 2 3 4 5 6
## 18.22400660 22.12497135 0.04497163 2.74777088 14.84118980 39.68265099
## 7 8 9 10 11 12
## 1.79714255 19.01443935 23.18446512 55.09651300 31.40901795 0.20048534
## 13 14
## 1.53407601 1.53407600
# graphique
barplot(contrib_pearson,horiz=TRUE,las=2,main="Contrib.Pearson",cex.names=0.8)
# Q.22
# revenir aux données - écart entre SURV=YES obs. et estimé, effectif
# les profils n°10 et n°6 correspondent aux carac. enoncées
# ce sont des profils dont la mortalité est pathologique
# l'écart entre effectifs observés et modélisés est important pour ces profils
print(data.frame(ECART=Y-YHAT,N=N,DPat$CLASSE,DPat$AGE,DPat$SEXE))
## ECART N DPat.CLASSE DPat.AGE DPat.SEXE
## 1 12.51342436 144 1ST ADULT FEMALE
## 2 -14.23168570 175 1ST ADULT MALE
## 3 0.04288589 1 1ST CHILD FEMALE
## 4 1.67537546 5 1ST CHILD MALE
## 5 11.54366229 93 2ND ADULT FEMALE
## 6 -19.38484698 168 2ND ADULT MALE
## 7 1.43440470 13 2ND CHILD FEMALE
## 8 6.40677999 11 2ND CHILD MALE
## 9 -17.41130494 165 3RD ADULT FEMALE
## 10 26.97075098 462 3RD ADULT MALE
## 11 -10.50383473 31 3RD CHILD FEMALE
## 12 0.94438869 48 3RD CHILD MALE
## 13 2.38076244 23 CREW ADULT FEMALE
## 14 -2.38076244 862 CREW ADULT MALE
# Q.23 -- Khi-deux de Pearson sans N°10 et n°6
khi2_pearson_bis <- stat_pearson - sum(contrib_pearson[c(6,10)])
print(khi2_pearson_bis)
## [1] 9.050429
# proba critique associée
# sans ces deux profils, la modélisation serait satisfaisante
# attention aux degrés de liberté encore une fois
print(pchisq(khi2_pearson_bis,(M-2)-J,lower.tail=FALSE))
## [1] 0.1707618
Rédidus déviance
# Q.24 - calculs pour les résidus déviance
# résidu déviance
res_dev <- residuals(mPat,"deviance")
print(res_dev)
## 1 2 3 4 5 6 7
## 3.8566494 -2.2186924 0.2960833 2.0201019 2.8999600 -4.1355521 1.7434963
## 8 9 10 11 12 13 14
## 4.3832530 -2.7195669 3.8349477 -4.1272775 0.3116936 1.2507319 -0.1943169
# stat deviance
stat_deviance <- sum(res_dev^2)
print(stat_deviance)
## [1] 112.5666
# p-value
print(pchisq(stat_deviance,M-J,lower.tail = FALSE))
## [1] 1.1293e-20
# Q.25
# "residual deviance" fournie par glm(.)
# appliquée sur les données groupées
print(summary(mPat))
##
## Call:
## glm(formula = response ~ CLASSE_ + AGE_ + SEXE_, family = binomial,
## data = DPat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.1356 -1.7126 0.7812 2.6800 4.3833
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.0438 0.1679 12.171 < 2e-16 ***
## CLASSE_2ND -1.0181 0.1960 -5.194 2.05e-07 ***
## CLASSE_3RD -1.7778 0.1716 -10.362 < 2e-16 ***
## CLASSE_CREW -0.8577 0.1573 -5.451 5.00e-08 ***
## AGE_CHILD 1.0615 0.2440 4.350 1.36e-05 ***
## SEXE_MALE -2.4201 0.1404 -17.236 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 671.96 on 13 degrees of freedom
## Residual deviance: 112.57 on 8 degrees of freedom
## AIC: 171.19
##
## Number of Fisher Scoring iterations: 5
# Q.26 -- résidu standardisé deviance
rstd_dev <- rstandard(mPat,type="deviance")
print(rstd_dev)
## 1 2 3 4 5 6 7
## 5.0306658 -4.7657589 0.2966251 2.1095691 4.1135595 -6.9508591 1.8407244
## 8 9 10 11 12 13 14
## 4.8795602 -4.7879205 6.9238109 -4.9900820 0.4440255 1.3210618 -1.2403815
# contrib deviance
contrib_dev <- rstd_dev^2
# barplot
barplot(contrib_dev,horiz=TRUE,las=2,main="Contrib.Deviance",cex.names=0.8)
Distance de Cook
# Q.27 + Q.28 - distance de Cook
dcook <- cooks.distance(mPat)
print(dcook)
## 1 2 3 4 5 6
## 2.130674e+00 1.332631e+01 2.745484e-05 4.146314e-02 2.503484e+00 1.206978e+01
## 7 8 9 10 11 12
## 3.433805e-02 7.582853e-01 8.112706e+00 2.074984e+01 2.417449e+00 3.439543e-02
## 13 14
## 2.956267e-02 1.016234e+01
#graphique
barplot(dcook,horiz=TRUE,las=2,main="Distance de Cook",cex.names=0.8)
# Q.29 - rajouter l'interaction MALE et CLASSE
mPatBis <- glm(response ~ CLASSE_ * SEXE_ + AGE_, data = DPat, family = binomial)
print(summary(mPatBis))
##
## Call:
## glm(formula = response ~ CLASSE_ * SEXE_ + AGE_, family = binomial,
## data = DPat)
##
## Deviance Residuals:
## 1 2 3 4 5 6 7 8
## -0.0050 -0.3259 0.1406 2.2841 -0.1998 -1.8814 1.1625 5.1564
## 9 10 11 12 13 14
## 1.0832 0.4503 -2.5433 -1.0917 0.0000 0.0000
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.5579 0.5071 7.017 2.27e-12 ***
## CLASSE_2ND -1.6806 0.5878 -2.859 0.00425 **
## CLASSE_3RD -3.8854 0.5287 -7.350 1.99e-13 ***
## CLASSE_CREW -1.6608 0.8003 -2.075 0.03797 *
## SEXE_MALE -4.2331 0.5310 -7.972 1.56e-15 ***
## AGE_CHILD 1.0537 0.2304 4.573 4.81e-06 ***
## CLASSE_2ND:SEXE_MALE 0.4483 0.6460 0.694 0.48772
## CLASSE_3RD:SEXE_MALE 2.8625 0.5633 5.082 3.73e-07 ***
## CLASSE_CREW:SEXE_MALE 1.0862 0.8197 1.325 0.18516
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 671.962 on 13 degrees of freedom
## Residual deviance: 45.899 on 5 degrees of freedom
## AIC: 110.52
##
## Number of Fisher Scoring iterations: 5
# Q.30 pour tester la pertinence de l'interaction
# statistique du test du rapport de varisemblance
LR <- mPat$deviance - mPatBis$deviance
print(LR)
## [1] 66.66739
# p-value
# attention aux degrés de liberté
print(pchisq(LR,mPat$df.residual-mPatBis$df.residual,lower.tail=FALSE))
## [1] 2.206127e-14
# Q.31 - test de la déviance
# est-ce que pour autant le modèle est compatible avec les données ?
# non, il manque des informations pour expliquer convenablement la survie
print(pchisq(mPatBis$deviance,mPatBis$df.residual,lower.tail=FALSE))
## [1] 9.521525e-09