Regression on individual data

# Q.0 + Q.1 - import the original data
# raw dataset
library(xlsx)
DRaw <- read.xlsx("titanic.xlsx",header=TRUE,sheetName="RAW",stringsAsFactors=TRUE,encoding="UTF-8")

# display the first rows
print(head(DRaw))
##   SURVIVANT CLASSE_  AGE_ SEXE_
## 1       YES     1ST ADULT  MALE
## 2       YES     1ST ADULT  MALE
## 3       YES     1ST ADULT  MALE
## 4       YES     1ST ADULT  MALE
## 5       YES     1ST ADULT  MALE
## 6       YES     1ST ADULT  MALE
# structure of the dataset
str(DRaw)
## 'data.frame':    2201 obs. of  4 variables:
##  $ SURVIVANT: Factor w/ 2 levels "NO","YES": 2 2 2 2 2 2 2 2 2 2 ...
##  $ CLASSE_  : Factor w/ 4 levels "1ST","2ND","3RD",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ AGE_     : Factor w/ 2 levels "ADULT","CHILD": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SEXE_    : Factor w/ 2 levels "FEMALE","MALE": 2 2 2 2 2 2 2 2 2 2 ...
# Q.2 - distribution of each variable
# Note: this only works because the columns
# are encoded as R "factor" type
# (see the stringsAsFactors option of read.xlsx)
print(summary(DRaw))
##  SURVIVANT  CLASSE_       AGE_         SEXE_     
##  NO :1490   1ST :325   ADULT:2092   FEMALE: 470  
##  YES: 711   2ND :285   CHILD: 109   MALE  :1731  
##             3RD :706                             
##             CREW:885
# Q.3 + Q.4 - logistic regression on these raw data
# the interpretation is quite telling (so to speak...)
mRaw <- glm(SURVIVANT ~ CLASSE_ + AGE_ + SEXE_, data = DRaw, family = binomial)
print(summary(mRaw))
## 
## Call:
## glm(formula = SURVIVANT ~ CLASSE_ + AGE_ + SEXE_, family = binomial, 
##     data = DRaw)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0812  -0.7149  -0.6656   0.6858   2.1278  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.0438     0.1679  12.171  < 2e-16 ***
## CLASSE_2ND   -1.0181     0.1960  -5.194 2.05e-07 ***
## CLASSE_3RD   -1.7778     0.1716 -10.362  < 2e-16 ***
## CLASSE_CREW  -0.8577     0.1573  -5.451 5.00e-08 ***
## AGE_CHILD     1.0615     0.2440   4.350 1.36e-05 ***
## SEXE_MALE    -2.4201     0.1404 -17.236  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2769.5  on 2200  degrees of freedom
## Residual deviance: 2210.1  on 2195  degrees of freedom
## AIC: 2222.1
## 
## Number of Fisher Scoring iterations: 4
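# as an aside (a quick sketch, not required by the exercise): the coefficients
# are easier to interpret as odds ratios once exponentiated
print(exp(coef(mRaw)))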
# Q.5 - number of theoretically possible combinations
print(nlevels(DRaw$CLASSE_)*nlevels(DRaw$AGE_)*nlevels(DRaw$SEXE_))
## [1] 16
# Q.6 + Q.7 - counts for each cross-tabulation
# we see that some combinations do not exist
# most notably: no children (AGE_ = CHILD)
# among the crew members (CLASSE_ = CREW),
# whether female or male
# => combinations (patterns) actually observed = 14
print(table(DRaw[2:ncol(DRaw)]))
## , , SEXE_ = FEMALE
## 
##        AGE_
## CLASSE_ ADULT CHILD
##    1ST    144     1
##    2ND     93    13
##    3RD    165    31
##    CREW    23     0
## 
## , , SEXE_ = MALE
## 
##        AGE_
## CLASSE_ ADULT CHILD
##    1ST    175     5
##    2ND    168    11
##    3RD    462    48
##    CREW   862     0
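# quick check (sketch): count the non-empty cells of the cross-tabulation
print(sum(table(DRaw[2:ncol(DRaw)]) > 0))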

Regression on grouped data

# Q.8 - load the second dataset
DPat <- read.xlsx("titanic.xlsx",header=TRUE,sheetName="PATTERN",stringsAsFactors=TRUE,encoding="UTF-8")

# display the dataset
print(DPat)
##    SURV_YES SURV_NO CLASSE_  AGE_  SEXE_
## 1       140       4     1ST ADULT FEMALE
## 2        57     118     1ST ADULT   MALE
## 3         1       0     1ST CHILD FEMALE
## 4         5       0     1ST CHILD   MALE
## 5        80      13     2ND ADULT FEMALE
## 6        14     154     2ND ADULT   MALE
## 7        13       0     2ND CHILD FEMALE
## 8        11       0     2ND CHILD   MALE
## 9        76      89     3RD ADULT FEMALE
## 10       75     387     3RD ADULT   MALE
## 11       14      17     3RD CHILD FEMALE
## 12       13      35     3RD CHILD   MALE
## 13       20       3    CREW ADULT FEMALE
## 14      192     670    CREW ADULT   MALE
# Q.9 - structure of the dataset
str(DPat)
## 'data.frame':    14 obs. of  5 variables:
##  $ SURV_YES: num  140 57 1 5 80 14 13 11 76 75 ...
##  $ SURV_NO : num  4 118 0 0 13 154 0 0 89 387 ...
##  $ CLASSE_ : Factor w/ 4 levels "1ST","2ND","3RD",..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ AGE_    : Factor w/ 2 levels "ADULT","CHILD": 1 1 2 2 1 1 2 2 1 1 ...
##  $ SEXE_   : Factor w/ 2 levels "FEMALE","MALE": 1 2 1 2 1 2 1 2 1 2 ...
# number of patterns (profiles)
M <- nrow(DPat)
print(M)
## [1] 14
# Q.10 - build the matrix corresponding to the target variable
response <- as.matrix(DPat[c("SURV_YES","SURV_NO")])
print(response)
##       SURV_YES SURV_NO
##  [1,]      140       4
##  [2,]       57     118
##  [3,]        1       0
##  [4,]        5       0
##  [5,]       80      13
##  [6,]       14     154
##  [7,]       13       0
##  [8,]       11       0
##  [9,]       76      89
## [10,]       75     387
## [11,]       14      17
## [12,]       13      35
## [13,]       20       3
## [14,]      192     670
# Q.11
# response is a two-column matrix:
# 1st column = number of positives, 2nd column = number of negatives
# estimated coefficients and standard errors identical to the raw-data model
# BUT deviance and degrees of freedom differ
mPat <- glm(response ~ CLASSE_ + AGE_ + SEXE_, data = DPat, family = binomial)
print(summary(mPat))
## 
## Call:
## glm(formula = response ~ CLASSE_ + AGE_ + SEXE_, family = binomial, 
##     data = DPat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.1356  -1.7126   0.7812   2.6800   4.3833  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.0438     0.1679  12.171  < 2e-16 ***
## CLASSE_2ND   -1.0181     0.1960  -5.194 2.05e-07 ***
## CLASSE_3RD   -1.7778     0.1716 -10.362  < 2e-16 ***
## CLASSE_CREW  -0.8577     0.1573  -5.451 5.00e-08 ***
## AGE_CHILD     1.0615     0.2440   4.350 1.36e-05 ***
## SEXE_MALE    -2.4201     0.1404 -17.236  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 671.96  on 13  degrees of freedom
## Residual deviance: 112.57  on  8  degrees of freedom
## AIC: 171.19
## 
## Number of Fisher Scoring iterations: 5
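# equivalent formulation (sketch): the two-column response can also be passed
# directly in the formula with cbind(); the fit should be identical to mPat
mPatCbind <- glm(cbind(SURV_YES,SURV_NO) ~ CLASSE_ + AGE_ + SEXE_, data = DPat, family = binomial)
print(coef(mPatCbind))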

Leverage

# Q.12 - compute the leverage
# of each profile
h <- hatvalues(mPat)
print(h)
##           1           2           3           4           5           6 
## 0.412281480 0.783264508 0.003649587 0.083021759 0.503009111 0.646010794 
##           7           8           9          10          11          12 
## 0.102851092 0.193077689 0.677369351 0.693218980 0.315911994 0.507235025 
##          13          14 
## 0.103640654 0.975457975
# Q.13 - check the sum of the leverages
print(sum(h))
## [1] 6
# number of parameters (coefficients) estimated in the model
J <- length(mPat$coefficients)
print(J)
## [1] 6
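# cross-check (a minimal sketch): recompute the hat values from the weighted
# design matrix, H = W^(1/2) X (X'WX)^(-1) X' W^(1/2), where W holds the
# IRLS working weights at convergence
X <- model.matrix(mPat)
w <- weights(mPat, type = "working")
H <- (sqrt(w) * X) %*% solve(t(X) %*% (w * X)) %*% t(sqrt(w) * X)
print(diag(H))   # should match hatvalues(mPat) up to numerical error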
# Q.14 - barplot
barplot(h,horiz=TRUE,las=2,main="Leverage",cex.names=0.8)

# the profiles corresponding to the 5
# highest leverage values
print(DPat[order(h,decreasing = TRUE)[1:5],])
##    SURV_YES SURV_NO CLASSE_  AGE_  SEXE_
## 14      192     670    CREW ADULT   MALE
## 2        57     118     1ST ADULT   MALE
## 10       75     387     3RD ADULT   MALE
## 9        76      89     3RD ADULT FEMALE
## 6        14     154     2ND ADULT   MALE
# Q.15
# N: total count for each pattern
N <- DPat$SURV_YES + DPat$SURV_NO
print(N)
##  [1] 144 175   1   5  93 168  13  11 165 462  31  48  23 862
# correspondence with the data -- display the counts, sorted by decreasing leverage
print(data.frame(N=N,DPat$CLASSE,DPat$AGE,DPat$SEXE)[order(h,decreasing = TRUE),])
##      N DPat.CLASSE DPat.AGE DPat.SEXE
## 14 862        CREW    ADULT      MALE
## 2  175         1ST    ADULT      MALE
## 10 462         3RD    ADULT      MALE
## 9  165         3RD    ADULT    FEMALE
## 6  168         2ND    ADULT      MALE
## 12  48         3RD    CHILD      MALE
## 5   93         2ND    ADULT    FEMALE
## 1  144         1ST    ADULT    FEMALE
## 11  31         3RD    CHILD    FEMALE
## 8   11         2ND    CHILD      MALE
## 13  23        CREW    ADULT    FEMALE
## 7   13         2ND    CHILD    FEMALE
## 4    5         1ST    CHILD      MALE
## 3    1         1ST    CHILD    FEMALE

Pearson residuals

# Q.16 - probability of being positive for each pattern,
# as provided by the logistic regression
proba <- predict(mPat,newdata=DPat,type="response")
print(proba)
##         1         2         3         4         5         6         7         8 
## 0.8853234 0.4070382 0.9571141 0.6649249 0.7360897 0.1987193 0.8896612 0.4175655 
##         9        10        11        12        13        14 
## 0.5661291 0.1039594 0.7904463 0.2511586 0.7660538 0.2254997
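# cross-check (sketch): the same probabilities obtained by applying the
# inverse logit to the linear predictor by hand
eta <- predict(mPat, type = "link")
print(1/(1+exp(-eta)))   # should match proba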
# Q.17 - compute the Pearson residuals
# column Y -- number of positives for each pattern
Y <- DPat$SURV_YES
print(Y)
##  [1] 140  57   1   5  80  14  13  11  76  75  14  13  20 192
# YHAT: estimated number of positives for each pattern,
# from the probabilities provided by the logistic regression
YHAT <- N * proba
print(YHAT)
##           1           2           3           4           5           6 
## 127.4865756  71.2316857   0.9571141   3.3246245  68.4563377  33.3848470 
##           7           8           9          10          11          12 
##  11.5655953   4.5932200  93.4113049  48.0292490  24.5038347  12.0556113 
##          13          14 
##  17.6192376 194.3807624
# Pearson residuals
res_pearson <- (Y - YHAT)/sqrt(N*proba*(1.0-proba))
print(res_pearson)
##          1          2          3          4          5          6          7 
##  3.2727032 -2.1898097  0.2116778  1.5873393  2.7158675 -3.7479635  1.2697655 
##          8          9         10         11         12         13         14 
##  3.9170366 -2.7349623  4.1112729 -4.6353568  0.3143122  1.1726395 -0.1940344
# Q.18 - verification
# with R's built-in function
residuals(mPat,"pearson")
##          1          2          3          4          5          6          7 
##  3.2727032 -2.1898097  0.2116778  1.5873393  2.7158675 -3.7479635  1.2697655 
##          8          9         10         11         12         13         14 
##  3.9170366 -2.7349623  4.1112729 -4.6353568  0.3143122  1.1726395 -0.1940344
# Q.19 - Pearson statistic
stat_pearson <- sum(res_pearson^2)
print(stat_pearson)
## [1] 103.8296
# critical probability -- p-value
# mind the degrees of freedom (M - J)
print(pchisq(stat_pearson,M-J,lower.tail = FALSE))
## [1] 7.026538e-19
# Q.20 - standardized Pearson residuals
rstd_pearson <- rstandard(mPat,type="pearson")
print(rstd_pearson)
##          1          2          3          4          5          6          7 
##  4.2689585 -4.7037189  0.2120651  1.6576402  3.8524265 -6.2994167  1.3405755 
##          8          9         10         11         12         13         14 
##  4.3605549 -4.8150249  7.4227025 -5.6043749  0.4477559  1.2385782 -1.2385782
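# cross-check (sketch): standardization by hand, dividing the raw Pearson
# residuals by sqrt(1 - leverage)
print(res_pearson/sqrt(1-h))   # should match rstd_pearson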
# Q.21 - Pearson contributions
contrib_pearson <- rstd_pearson^2
print(contrib_pearson)
##           1           2           3           4           5           6 
## 18.22400660 22.12497135  0.04497163  2.74777088 14.84118980 39.68265099 
##           7           8           9          10          11          12 
##  1.79714255 19.01443935 23.18446512 55.09651300 31.40901795  0.20048534 
##          13          14 
##  1.53407601  1.53407600
# plot
barplot(contrib_pearson,horiz=TRUE,las=2,main="Contrib.Pearson",cex.names=0.8)

# Q.22
# back to the data - gap between observed and fitted SURV=YES, plus the counts
# profiles #10 and #6 match the stated characteristics
# these are profiles with atypical mortality
# the gap between observed and fitted counts is large for these profiles
print(data.frame(ECART=Y-YHAT,N=N,DPat$CLASSE,DPat$AGE,DPat$SEXE))
##           ECART   N DPat.CLASSE DPat.AGE DPat.SEXE
## 1   12.51342436 144         1ST    ADULT    FEMALE
## 2  -14.23168570 175         1ST    ADULT      MALE
## 3    0.04288589   1         1ST    CHILD    FEMALE
## 4    1.67537546   5         1ST    CHILD      MALE
## 5   11.54366229  93         2ND    ADULT    FEMALE
## 6  -19.38484698 168         2ND    ADULT      MALE
## 7    1.43440470  13         2ND    CHILD    FEMALE
## 8    6.40677999  11         2ND    CHILD      MALE
## 9  -17.41130494 165         3RD    ADULT    FEMALE
## 10  26.97075098 462         3RD    ADULT      MALE
## 11 -10.50383473  31         3RD    CHILD    FEMALE
## 12   0.94438869  48         3RD    CHILD      MALE
## 13   2.38076244  23        CREW    ADULT    FEMALE
## 14  -2.38076244 862        CREW    ADULT      MALE
# Q.23 -- Pearson chi-squared without profiles #10 and #6
khi2_pearson_bis <- stat_pearson - sum(contrib_pearson[c(6,10)])
print(khi2_pearson_bis)
## [1] 9.050429
# associated p-value
# without these two profiles, the fit would be satisfactory
# mind the degrees of freedom once again
print(pchisq(khi2_pearson_bis,(M-2)-J,lower.tail=FALSE))
## [1] 0.1707618

Deviance residuals

# Q.24 - computations for the deviance residuals

# deviance residuals
res_dev <- residuals(mPat,"deviance")
print(res_dev)
##          1          2          3          4          5          6          7 
##  3.8566494 -2.2186924  0.2960833  2.0201019  2.8999600 -4.1355521  1.7434963 
##          8          9         10         11         12         13         14 
##  4.3832530 -2.7195669  3.8349477 -4.1272775  0.3116936  1.2507319 -0.1943169
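# manual computation (sketch): for a binomial pattern,
# d = sign(y - yhat) * sqrt(2*(y*log(y/yhat) + (n-y)*log((n-y)/(n-yhat)))),
# with the convention 0*log(0) = 0 for the saturated cells
t1 <- ifelse(Y > 0, Y*log(Y/YHAT), 0)
t2 <- ifelse(N-Y > 0, (N-Y)*log((N-Y)/(N-YHAT)), 0)
print(sign(Y-YHAT)*sqrt(2*(t1+t2)))   # should match res_dev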
# deviance statistic
stat_deviance <- sum(res_dev^2)
print(stat_deviance)
## [1] 112.5666
# p-value
print(pchisq(stat_deviance,M-J,lower.tail = FALSE))
## [1] 1.1293e-20
# Q.25
# "residual deviance" fournie par glm(.)
# appliquée sur les données groupées
print(summary(mPat))
## 
## Call:
## glm(formula = response ~ CLASSE_ + AGE_ + SEXE_, family = binomial, 
##     data = DPat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.1356  -1.7126   0.7812   2.6800   4.3833  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.0438     0.1679  12.171  < 2e-16 ***
## CLASSE_2ND   -1.0181     0.1960  -5.194 2.05e-07 ***
## CLASSE_3RD   -1.7778     0.1716 -10.362  < 2e-16 ***
## CLASSE_CREW  -0.8577     0.1573  -5.451 5.00e-08 ***
## AGE_CHILD     1.0615     0.2440   4.350 1.36e-05 ***
## SEXE_MALE    -2.4201     0.1404 -17.236  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 671.96  on 13  degrees of freedom
## Residual deviance: 112.57  on  8  degrees of freedom
## AIC: 171.19
## 
## Number of Fisher Scoring iterations: 5
# Q.26 -- standardized deviance residuals
rstd_dev <- rstandard(mPat,type="deviance")
print(rstd_dev)
##          1          2          3          4          5          6          7 
##  5.0306658 -4.7657589  0.2966251  2.1095691  4.1135595 -6.9508591  1.8407244 
##          8          9         10         11         12         13         14 
##  4.8795602 -4.7879205  6.9238109 -4.9900820  0.4440255  1.3210618 -1.2403815
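# cross-check (sketch): same standardization as for Pearson, dividing the raw
# deviance residuals by sqrt(1 - leverage)
print(res_dev/sqrt(1-h))   # should match rstd_dev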
# deviance contributions
contrib_dev <- rstd_dev^2

# barplot
barplot(contrib_dev,horiz=TRUE,las=2,main="Contrib.Deviance",cex.names=0.8)

Cook's distance

# Q.27 + Q.28 - Cook's distance
dcook <- cooks.distance(mPat)
print(dcook)
##            1            2            3            4            5            6 
## 2.130674e+00 1.332631e+01 2.745484e-05 4.146314e-02 2.503484e+00 1.206978e+01 
##            7            8            9           10           11           12 
## 3.433805e-02 7.582853e-01 8.112706e+00 2.074984e+01 2.417449e+00 3.439543e-02 
##           13           14 
## 2.956267e-02 1.016234e+01
# plot
barplot(dcook,horiz=TRUE,las=2,main="Cook's distance",cex.names=0.8)
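# cross-check (sketch): Cook's distance from the standardized Pearson residual
# and the leverage, with the dispersion taken to be 1:
# D = rstd_pearson^2 * h / (J * (1 - h))
print(rstd_pearson^2 * h / (J*(1-h)))   # should match dcook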

# Q.29 - add the interaction between SEXE_ (MALE) and CLASSE_
mPatBis <- glm(response ~ CLASSE_ * SEXE_ + AGE_, data = DPat, family = binomial)
print(summary(mPatBis))
## 
## Call:
## glm(formula = response ~ CLASSE_ * SEXE_ + AGE_, family = binomial, 
##     data = DPat)
## 
## Deviance Residuals: 
##       1        2        3        4        5        6        7        8  
## -0.0050  -0.3259   0.1406   2.2841  -0.1998  -1.8814   1.1625   5.1564  
##       9       10       11       12       13       14  
##  1.0832   0.4503  -2.5433  -1.0917   0.0000   0.0000  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             3.5579     0.5071   7.017 2.27e-12 ***
## CLASSE_2ND             -1.6806     0.5878  -2.859  0.00425 ** 
## CLASSE_3RD             -3.8854     0.5287  -7.350 1.99e-13 ***
## CLASSE_CREW            -1.6608     0.8003  -2.075  0.03797 *  
## SEXE_MALE              -4.2331     0.5310  -7.972 1.56e-15 ***
## AGE_CHILD               1.0537     0.2304   4.573 4.81e-06 ***
## CLASSE_2ND:SEXE_MALE    0.4483     0.6460   0.694  0.48772    
## CLASSE_3RD:SEXE_MALE    2.8625     0.5633   5.082 3.73e-07 ***
## CLASSE_CREW:SEXE_MALE   1.0862     0.8197   1.325  0.18516    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 671.962  on 13  degrees of freedom
## Residual deviance:  45.899  on  5  degrees of freedom
## AIC: 110.52
## 
## Number of Fisher Scoring iterations: 5
# Q.30 - test the relevance of the interaction

# likelihood-ratio test statistic
LR <- mPat$deviance - mPatBis$deviance
print(LR)
## [1] 66.66739
# p-value
# mind the degrees of freedom
print(pchisq(LR,mPat$df.residual-mPatBis$df.residual,lower.tail=FALSE))
## [1] 2.206127e-14
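# equivalent built-in check (sketch): the same likelihood-ratio test via anova()
print(anova(mPat, mPatBis, test = "Chisq"))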
# Q.31 - deviance test
# does that mean the model is consistent with the data?
# no, information is still missing to properly explain survival
print(pchisq(mPatBis$deviance,mPatBis$df.residual,lower.tail=FALSE))
## [1] 9.521525e-09