Importation, modélisation, évaluation

# brancher les packages
library(xlsx) #lecture directe des fichiers EXCEL

# Q.0 - load the training sample (sheetIndex=1)
# stringsAsFactors=TRUE makes the target "diabete" a factor,
# which glm()'s binomial family requires
# (fix: header=TRUE instead of the reassignable shorthand T)
DTrain <- read.xlsx(file="pima-non-linearite.xlsx",header=TRUE,sheetIndex=1,stringsAsFactors=TRUE,encoding = "UTF-8")
str(DTrain)
## 'data.frame':    292 obs. of  5 variables:
##  $ plasma   : num  113 110 79 99 158 100 106 104 106 109 ...
##  $ diastolic: num  50 74 60 70 84 78 56 64 54 58 ...
##  $ triceps  : num  10 29 42 16 41 25 27 37 21 18 ...
##  $ serum    : num  85 125 48 44 210 184 165 64 158 116 ...
##  $ diabete  : Factor w/ 2 levels "negative","positive": 1 1 1 1 2 2 1 2 1 1 ...
# Q.1 - load the test sample (sheetIndex=2)
# same options as for the training sample
# (fix: header=TRUE instead of the reassignable shorthand T)
DTest <- read.xlsx(file="pima-non-linearite.xlsx",header=TRUE,sheetIndex=2,stringsAsFactors=TRUE,encoding = "UTF-8")
str(DTest)
## 'data.frame':    1160 obs. of  5 variables:
##  $ plasma   : num  117 184 89 108 87 104 90 139 95 100 ...
##  $ diastolic: num  80 84 62 72 0 64 80 64 80 66 ...
##  $ triceps  : num  31 33 0 43 23 37 14 35 45 29 ...
##  $ serum    : num  53 0 0 75 0 64 55 140 92 196 ...
##  $ diabete  : Factor w/ 2 levels "negative","positive": 1 2 1 1 1 2 1 1 1 1 ...
# Q.2 - fit the full model, no variable selection
# binary logistic regression of diabete on the 4 predictors
modele_glm <- glm(diabete ~ ., data = DTrain, family = binomial)
summary(modele_glm)
## 
## Call:
## glm(formula = diabete ~ ., family = binomial, data = DTrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2955  -0.7346  -0.4205   0.7691   2.5119  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.3262939  1.1388424  -6.433 1.25e-10 ***
## plasma       0.0372648  0.0061634   6.046 1.48e-09 ***
## diastolic    0.0056375  0.0124232   0.454   0.6500    
## triceps      0.0473120  0.0149915   3.156   0.0016 ** 
## serum        0.0004106  0.0013703   0.300   0.7644    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 369.85  on 291  degrees of freedom
## Residual deviance: 282.84  on 287  degrees of freedom
## AIC: 292.84
## 
## Number of Fisher Scoring iterations: 4
# Q.3 - prediction on the test set: class "positive" when the
# estimated posterior probability exceeds 0.5
# explicit levels guarantee both classes exist in the factor even
# if one class is never predicted (keeps confusionMatrix aligned
# with the reference factor)
pred1 <- factor(ifelse(predict(modele_glm, newdata = DTest, type = "response") > 0.5,
                       "positive", "negative"),
                levels = c("negative", "positive"))
print(summary(pred1))
## negative positive 
##      925      235
# librairie caret 
# /!\ attention, il faut aussi installer manuellement e1071
library(caret)
## Le chargement a nécessité le package : lattice
## Le chargement a nécessité le package : ggplot2
# Q.4 - confusion matrix + additional metrics
# NOTE: caret's confusion matrix is transposed relative to the
# convention used in the course (here rows = predictions)
caret::confusionMatrix(data=pred1,reference=DTest$diabete,positive="positive")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction negative positive
##   negative      702      223
##   positive       60      175
##                                           
##                Accuracy : 0.756           
##                  95% CI : (0.7303, 0.7805)
##     No Information Rate : 0.6569          
##     P-Value [Acc > NIR] : 1.67e-13        
##                                           
##                   Kappa : 0.4001          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.4397          
##             Specificity : 0.9213          
##          Pos Pred Value : 0.7447          
##          Neg Pred Value : 0.7589          
##              Prevalence : 0.3431          
##          Detection Rate : 0.1509          
##    Detection Prevalence : 0.2026          
##       Balanced Accuracy : 0.6805          
##                                           
##        'Positive' Class : positive        
## 

Test de Box-Tidwell

# Q.5 + 6
# Box-Tidwell test for each continuous predictor: add the term
# Z = x*log(x) to the model; a significant coefficient on Z
# signals a non-linear effect of x on the logit.
# NOTE(review): assumes all training values are strictly positive,
# otherwise x*log(x) yields NaN -- confirm on the data.
# (fixes: seq_len over the predictor columns instead of the
# hard-coded 1:4; paste0 instead of paste(..., sep=""))
for (j in seq_len(ncol(DTrain) - 1)){  # every column but the target (last)
  Z <- DTrain[,j] * log(DTrain[,j])
  print(paste0(">>> TEST pour ",toupper(colnames(DTrain)[j])))
  modele.bt <- glm(diabete ~ ., data = cbind(DTrain,Z), family = binomial)
  print(summary(modele.bt))
}
## [1] ">>> TEST pour PLASMA"
## 
## Call:
## glm(formula = diabete ~ ., family = binomial, data = cbind(DTrain, 
##     Z))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2206  -0.7311  -0.4080   0.7735   2.5309  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -9.2472786  5.6460563  -1.638  0.10146   
## plasma       0.1262044  0.2553627   0.494  0.62115   
## diastolic    0.0054256  0.0124475   0.436  0.66292   
## triceps      0.0469180  0.0150447   3.119  0.00182 **
## serum        0.0003827  0.0013588   0.282  0.77819   
## Z           -0.0151160  0.0433638  -0.349  0.72740   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 369.85  on 291  degrees of freedom
## Residual deviance: 282.72  on 286  degrees of freedom
## AIC: 294.72
## 
## Number of Fisher Scoring iterations: 5
## 
## [1] ">>> TEST pour DIASTOLIC"
## 
## Call:
## glm(formula = diabete ~ ., family = binomial, data = cbind(DTrain, 
##     Z))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2641  -0.7239  -0.4127   0.7520   2.5302  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.8651303  4.7134016  -0.396   0.6923    
## plasma       0.0375388  0.0061820   6.072 1.26e-09 ***
## diastolic   -0.4227361  0.3661357  -1.155   0.2483    
## triceps      0.0456039  0.0150208   3.036   0.0024 ** 
## serum        0.0004353  0.0013754   0.317   0.7516    
## Z            0.0821871  0.0703965   1.167   0.2430    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 369.85  on 291  degrees of freedom
## Residual deviance: 281.59  on 286  degrees of freedom
## AIC: 293.59
## 
## Number of Fisher Scoring iterations: 4
## 
## [1] ">>> TEST pour TRICEPS"
## 
## Call:
## glm(formula = diabete ~ ., family = binomial, data = cbind(DTrain, 
##     Z))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2907  -0.7308  -0.4070   0.7581   2.9793  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.061e+01  2.600e+00  -4.081 4.49e-05 ***
## plasma       3.617e-02  6.185e-03   5.848 4.98e-09 ***
## diastolic    7.117e-03  1.260e-02   0.565    0.572    
## triceps      5.514e-01  3.494e-01   1.578    0.115    
## serum        4.177e-04  1.373e-03   0.304    0.761    
## Z           -1.139e-01  7.865e-02  -1.449    0.147    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 369.85  on 291  degrees of freedom
## Residual deviance: 280.57  on 286  degrees of freedom
## AIC: 292.57
## 
## Number of Fisher Scoring iterations: 5
## 
## [1] ">>> TEST pour SERUM"
## 
## Call:
## glm(formula = diabete ~ ., family = binomial, data = cbind(DTrain, 
##     Z))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9377  -0.7233  -0.3732   0.7422   2.3526  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.722036   1.302385  -6.697 2.13e-11 ***
## plasma       0.033574   0.006337   5.298 1.17e-07 ***
## diastolic    0.006834   0.012738   0.537  0.59160    
## triceps      0.043989   0.015172   2.899  0.00374 ** 
## serum        0.056763   0.020117   2.822  0.00478 ** 
## Z           -0.008487   0.003020  -2.810  0.00495 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 369.85  on 291  degrees of freedom
## Residual deviance: 274.42  on 286  degrees of freedom
## AIC: 286.42
## 
## Number of Fisher Scoring iterations: 5

Graphique des résidus partiels

# Q.7

# package rms
library(rms)
## Le chargement a nécessité le package : Hmisc
## Le chargement a nécessité le package : survival
## 
## Attachement du package : 'survival'
## L'objet suivant est masqué depuis 'package:caret':
## 
##     cluster
## Le chargement a nécessité le package : Formula
## 
## Attachement du package : 'Hmisc'
## Les objets suivants sont masqués depuis 'package:base':
## 
##     format.pval, units
## Le chargement a nécessité le package : SparseM
## 
## Attachement du package : 'SparseM'
## L'objet suivant est masqué depuis 'package:base':
## 
##     backsolve
# logistic regression with rms::lrm
# the x and y options store the design matrix and the response
# in the fitted object -- required for the partial-residual
# plots of the next question
modele <- rms::lrm(diabete ~ ., data = DTrain, x = TRUE, y = TRUE)
print(modele)
## Logistic Regression Model
##  
##  rms::lrm(formula = diabete ~ ., data = DTrain, x = TRUE, y = TRUE)
##  
##                         Model Likelihood    Discrimination    Rank Discrim.    
##                               Ratio Test           Indexes          Indexes    
##  Obs           292    LR chi2      87.00    R2       0.359    C       0.818    
##   negative     196    d.f.             4    g        1.576    Dxy     0.637    
##   positive      96    Pr(> chi2) <0.0001    gr       4.835    gamma   0.637    
##  max |deriv| 4e-10                          gp       0.278    tau-a   0.282    
##                                             Brier    0.159                     
##  
##            Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept -7.3263 1.1389 -6.43  <0.0001 
##  plasma     0.0373 0.0062  6.05  <0.0001 
##  diastolic  0.0056 0.0124  0.45  0.6500  
##  triceps    0.0473 0.0150  3.16  0.0016  
##  serum      0.0004 0.0014  0.30  0.7645  
## 
# Q.8 + 9 - partial residual plots, one per predictor,
# arranged in a 2 x 2 panel
par(mfrow=c(2,2))
plot.lrm.partial(modele)

Régression sur variables transformées

# Q.10 + 11
# *****************************
# variable transformations suggested by the
# partial-residual plots
# *****************************

# squared terms for diastolic and serum
diastolic2 <- DTrain$diastolic^2
serum2 <- DTrain$serum^2

# discretisation of triceps - non-nested dummy coding
t1 <- ifelse(DTrain$triceps >= 20 & DTrain$triceps < 30, 1, 0)
t2 <- ifelse(DTrain$triceps >= 30, 1, 0)

# augmented training frame, assembled in a single step
DTrainBis <- cbind(DTrain, diastolic2, serum2, t1, t2)

# new model on the transformed variables
# it fits the data better (cf. e.g. Nagelkerke R2) -- expected,
# since we artificially increased the dimensionality
# (fix: x=TRUE, y=TRUE instead of the reassignable shorthand T)
modele.trans <- lrm(diabete ~ plasma+diastolic+diastolic2+triceps+t1+t2+serum+serum2,
                    data=DTrainBis, x=TRUE, y=TRUE)
print(modele.trans)
## Logistic Regression Model
##  
##  lrm(formula = diabete ~ plasma + diastolic + diastolic2 + triceps + 
##      t1 + t2 + serum + serum2, data = DTrainBis, x = T, y = T)
##  
##                          Model Likelihood    Discrimination    Rank Discrim.    
##                                Ratio Test           Indexes          Indexes    
##  Obs            292    LR chi2      99.01    R2       0.400    C       0.829    
##   negative      196    d.f.             8    g        1.831    Dxy     0.658    
##   positive       96    Pr(> chi2) <0.0001    gr       6.239    gamma   0.658    
##  max |deriv| 0.0007                          gp       0.295    tau-a   0.291    
##                                              Brier    0.155                     
##  
##             Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept  -3.7181 3.0346 -1.23  0.2205  
##  plasma      0.0348 0.0064  5.43  <0.0001 
##  diastolic  -0.1125 0.0861 -1.31  0.1913  
##  diastolic2  0.0009 0.0006  1.40  0.1615  
##  triceps    -0.0046 0.0298 -0.15  0.8778  
##  t1          1.2179 0.6281  1.94  0.0525  
##  t2          1.7845 0.8645  2.06  0.0390  
##  serum       0.0085 0.0037  2.30  0.0214  
##  serum2      0.0000 0.0000 -2.38  0.0175  
## 
# Q.12 - new partial-residual plots for the transformed model,
# arranged in a 4 x 2 panel (8 predictors)
par(mfrow=c(4,2))
plot.lrm.partial(modele.trans)

# Q.13 - refit the same model in glm(.) format,
# needed to run predict() on the test sample afterwards
modele.trans.glm <- glm(diabete ~ plasma+diastolic+diastolic2+triceps+t1+t2+serum+serum2,data=DTrainBis,family = binomial)
print(summary(modele.trans.glm))
## 
## Call:
## glm(formula = diabete ~ plasma + diastolic + diastolic2 + triceps + 
##     t1 + t2 + serum + serum2, family = binomial, data = DTrainBis)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8122  -0.7321  -0.3622   0.7279   2.4041  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.718e+00  3.035e+00  -1.225   0.2205    
## plasma       3.479e-02  6.400e-03   5.435 5.48e-08 ***
## diastolic   -1.125e-01  8.607e-02  -1.307   0.1913    
## diastolic2   8.702e-04  6.215e-04   1.400   0.1615    
## triceps     -4.585e-03  2.982e-02  -0.154   0.8778    
## t1           1.218e+00  6.281e-01   1.939   0.0525 .  
## t2           1.785e+00  8.645e-01   2.064   0.0390 *  
## serum        8.466e-03  3.679e-03   2.301   0.0214 *  
## serum2      -1.261e-05  5.305e-06  -2.377   0.0175 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 369.85  on 291  degrees of freedom
## Residual deviance: 270.84  on 283  degrees of freedom
## AIC: 288.84
## 
## Number of Fisher Scoring iterations: 5
# Q.14

# apply the SAME transformations to the test sample
# squared terms
diastolic2 <- DTest$diastolic^2
serum2 <- DTest$serum^2

# triceps dummies - non-nested coding, same thresholds as training
t1 <- ifelse(DTest$triceps >= 20 & DTest$triceps < 30, 1, 0)
t2 <- ifelse(DTest$triceps >= 30, 1, 0)

# augmented test frame, assembled in a single step
DTestBis <- cbind(DTest, diastolic2, serum2, t1, t2)

# prediction
# explicit levels guarantee both classes exist in the factor even
# if one class is never predicted
pred2 <- factor(ifelse(predict(modele.trans.glm, newdata = DTestBis, type = "response") > 0.5,
                       "positive", "negative"),
                levels = c("negative", "positive"))
print(summary(pred2))
## negative positive 
##      893      267
# Q.15 - test-set performance
# the new variables do NOT translate into better test performance:
# what was gained on bias is lost on variance
caret::confusionMatrix(data=pred2,reference=DTest$diabete,positive="positive")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction negative positive
##   negative      677      216
##   positive       85      182
##                                           
##                Accuracy : 0.7405          
##                  95% CI : (0.7143, 0.7655)
##     No Information Rate : 0.6569          
##     P-Value [Acc > NIR] : 5.203e-10       
##                                           
##                   Kappa : 0.3752          
##                                           
##  Mcnemar's Test P-Value : 6.728e-14       
##                                           
##             Sensitivity : 0.4573          
##             Specificity : 0.8885          
##          Pos Pred Value : 0.6816          
##          Neg Pred Value : 0.7581          
##              Prevalence : 0.3431          
##          Detection Rate : 0.1569          
##    Detection Prevalence : 0.2302          
##       Balanced Accuracy : 0.6729          
##                                           
##        'Positive' Class : positive        
## 

Traitement des non-linéarités par la discrétisation

# Q.16 + 17 + 18
#package
library(discretization)

# rappel configuration des données
str(DTrain)
## 'data.frame':    292 obs. of  5 variables:
##  $ plasma   : num  113 110 79 99 158 100 106 104 106 109 ...
##  $ diastolic: num  50 74 60 70 84 78 56 64 54 58 ...
##  $ triceps  : num  10 29 42 16 41 25 27 37 21 18 ...
##  $ serum    : num  85 125 48 44 210 184 165 64 158 116 ...
##  $ diabete  : Factor w/ 2 levels "negative","positive": 1 1 1 1 2 2 1 2 1 1 ...
# supervised discretisation (MDLP),
# guided by the target variable diabete,
# which sits in the LAST column of our data frame,
# because we are in a predictive-analysis setting
disc <- discretization::mdlp(DTrain)

# display the discretisation cut points
# ("All" below means no cut point was found for that variable)
cat("\n\n")
print(disc$cutp)
## [[1]]
## [1] 127.5
## 
## [[2]]
## [1] "All"
## 
## [[3]]
## [1] 21.5
## 
## [[4]]
## [1] 87.5
# Q.19
# recoded data frame (interval codes 1, 2, ...)
print(head(disc$Disc.data))
##   plasma diastolic triceps serum  diabete
## 1      1         1       1     1 negative
## 2      1         1       2     2 negative
## 3      1         1       2     1 negative
## 4      1         1       1     1 negative
## 5      2         1       2     2 positive
## 6      1         1       2     2 positive
# Q.20
# build the training frame for the regression:
# map the mdlp interval codes {1, 2} onto dummy values {0, 1}
DTrainDisc <- data.frame(lapply(disc$Disc.data[1:4], function(x) x - 1))

# diastolic got no cut point ("All") -> constant column, drop it
DTrainDisc$diastolic <- NULL

# put the target variable back
DTrainDisc$diabete <- DTrain$diabete
str(DTrainDisc)
## 'data.frame':    292 obs. of  4 variables:
##  $ plasma : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ triceps: num  0 1 1 0 1 1 1 1 0 0 ...
##  $ serum  : num  0 1 0 0 1 1 1 0 1 1 ...
##  $ diabete: Factor w/ 2 levels "negative","positive": 1 1 1 1 2 2 1 2 1 1 ...
# Q.21 - rerun the logistic regression on the recoded predictors

# with lrm, to obtain the pseudo-R2 and the rank indexes
print(lrm(diabete ~ ., data = DTrainDisc, x = TRUE, y = TRUE))
## Logistic Regression Model
##  
##  lrm(formula = diabete ~ ., data = DTrainDisc, x = TRUE, y = TRUE)
##  
##                         Model Likelihood    Discrimination    Rank Discrim.    
##                               Ratio Test           Indexes          Indexes    
##  Obs           292    LR chi2     102.81    R2       0.413    C       0.825    
##   negative     196    d.f.             3    g        1.909    Dxy     0.651    
##   positive      96    Pr(> chi2) <0.0001    gr       6.748    gamma   0.779    
##  max |deriv| 3e-06                          gp       0.290    tau-a   0.288    
##                                             Brier    0.152                     
##  
##            Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept -4.2304 0.6266 -6.75  <0.0001 
##  plasma     1.6432 0.3069  5.35  <0.0001 
##  triceps    1.5366 0.4619  3.33  0.0009  
##  serum      1.9021 0.5082  3.74  0.0002  
## 
# with glm - so that we can run the prediction on the test sample
modele.disc <- glm(diabete ~ .,data=DTrainDisc,family = binomial)
print(summary(modele.disc))
## 
## Call:
## glm(formula = diabete ~ ., family = binomial, data = DTrainDisc)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5537  -0.8646  -0.3618   0.8431   2.3491  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.2304     0.6266  -6.752 1.46e-11 ***
## plasma        1.6432     0.3069   5.354 8.61e-08 ***
## triceps       1.5366     0.4619   3.327 0.000879 ***
## serum         1.9021     0.5081   3.743 0.000182 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 369.85  on 291  degrees of freedom
## Residual deviance: 267.04  on 288  degrees of freedom
## AIC: 275.04
## 
## Number of Fisher Scoring iterations: 5
# Q.22 -- recode the test-set variables
# /!\ using the cut points learned on the TRAINING sample
# (do NOT re-run the discretisation on the test data) /!\

# plasma (logical > cut point, coerced to 0/1)
plasmaTest <- as.numeric(DTest$plasma > disc$cutp[[1]])
print(head(plasmaTest))
## [1] 0 1 0 0 0 0
# triceps
tricepsTest <- as.numeric(DTest$triceps > disc$cutp[[3]])
print(head(tricepsTest))
## [1] 1 1 0 1 1 1
# serum
serumTest <- as.numeric(DTest$serum > disc$cutp[[4]])
print(head(serumTest))
## [1] 0 0 0 0 0 0
# new data frame, columns named to match the training frame
DTestDisc <- data.frame(plasma = plasmaTest, triceps = tricepsTest, serum = serumTest)
str(DTestDisc)
## 'data.frame':    1160 obs. of  3 variables:
##  $ plasma : num  0 1 0 0 0 0 0 1 0 0 ...
##  $ triceps: num  1 1 0 1 1 1 0 1 1 1 ...
##  $ serum  : num  0 0 0 0 0 0 0 1 1 1 ...
# Q.23
# prediction on the recoded test sample
# explicit levels guarantee both classes exist in the factor even
# if one class is never predicted (keeps confusionMatrix aligned
# with the reference factor)
pred3 <- factor(ifelse(predict(modele.disc, newdata = DTestDisc, type = "response") > 0.5,
                       "positive", "negative"),
                levels = c("negative", "positive"))
print(summary(pred3))
## negative positive 
##      928      232
# evaluation
caret::confusionMatrix(data=pred3,reference=DTest$diabete,positive="positive")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction negative positive
##   negative      692      236
##   positive       70      162
##                                           
##                Accuracy : 0.7362          
##                  95% CI : (0.7098, 0.7614)
##     No Information Rate : 0.6569          
##     P-Value [Acc > NIR] : 3.766e-09       
##                                           
##                   Kappa : 0.35            
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.4070          
##             Specificity : 0.9081          
##          Pos Pred Value : 0.6983          
##          Neg Pred Value : 0.7457          
##              Prevalence : 0.3431          
##          Detection Rate : 0.1397          
##    Detection Prevalence : 0.2000          
##       Balanced Accuracy : 0.6576          
##                                           
##        'Positive' Class : positive        
##