# Importation, modélisation, évaluation ----
# Load the required packages
library(xlsx) # direct reading of Excel files
# Q.0 - load the training data (sheetIndex=1)
# use TRUE rather than T: T/F are ordinary variables and can be reassigned
DTrain <- read.xlsx(file="pima-non-linearite.xlsx",header=TRUE,sheetIndex=1,stringsAsFactors=TRUE,encoding = "UTF-8")
str(DTrain)
## 'data.frame': 292 obs. of 5 variables:
## $ plasma : num 113 110 79 99 158 100 106 104 106 109 ...
## $ diastolic: num 50 74 60 70 84 78 56 64 54 58 ...
## $ triceps : num 10 29 42 16 41 25 27 37 21 18 ...
## $ serum : num 85 125 48 44 210 184 165 64 158 116 ...
## $ diabete : Factor w/ 2 levels "negative","positive": 1 1 1 1 2 2 1 2 1 1 ...
# Q.1 - load the test sample (sheetIndex=2)
DTest <- read.xlsx(file="pima-non-linearite.xlsx",header=TRUE,sheetIndex=2,stringsAsFactors=TRUE,encoding = "UTF-8")
str(DTest)
## 'data.frame': 1160 obs. of 5 variables:
## $ plasma : num 117 184 89 108 87 104 90 139 95 100 ...
## $ diastolic: num 80 84 62 72 0 64 80 64 80 66 ...
## $ triceps : num 31 33 0 43 23 37 14 35 45 29 ...
## $ serum : num 53 0 0 75 0 64 55 140 92 196 ...
## $ diabete : Factor w/ 2 levels "negative","positive": 1 2 1 1 1 2 1 1 1 1 ...
# Q.2 - fit the base model, without variable selection
# logistic regression of diabete on all four predictors
modele_glm <- glm(diabete ~ ., data = DTrain, family = binomial)
summary(modele_glm)
##
## Call:
## glm(formula = diabete ~ ., family = binomial, data = DTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2955 -0.7346 -0.4205 0.7691 2.5119
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.3262939 1.1388424 -6.433 1.25e-10 ***
## plasma 0.0372648 0.0061634 6.046 1.48e-09 ***
## diastolic 0.0056375 0.0124232 0.454 0.6500
## triceps 0.0473120 0.0149915 3.156 0.0016 **
## serum 0.0004106 0.0013703 0.300 0.7644
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 369.85 on 291 degrees of freedom
## Residual deviance: 282.84 on 287 degrees of freedom
## AIC: 292.84
##
## Number of Fisher Scoring iterations: 4
# Q.3 - prediction on the test sample
# posterior probability of "positive", thresholded at 0.5
proba1 <- predict(modele_glm, newdata = DTest, type = "response")
pred1 <- factor(ifelse(proba1 > 0.5, "positive", "negative"))
print(summary(pred1))
## negative positive
## 925 235
# caret library (confusion matrix and associated statistics)
# /!\ caution: the e1071 package must also be installed manually
library(caret)
## Le chargement a nécessité le package : lattice
## Le chargement a nécessité le package : ggplot2
# Q.4 - confusion matrix + additional statistics
# caution: the confusion matrix is transposed compared to
# the presentation used in the lectures
caret::confusionMatrix(data=pred1,reference=DTest$diabete,positive="positive")
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 702 223
## positive 60 175
##
## Accuracy : 0.756
## 95% CI : (0.7303, 0.7805)
## No Information Rate : 0.6569
## P-Value [Acc > NIR] : 1.67e-13
##
## Kappa : 0.4001
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.4397
## Specificity : 0.9213
## Pos Pred Value : 0.7447
## Neg Pred Value : 0.7589
## Prevalence : 0.3431
## Detection Rate : 0.1509
## Detection Prevalence : 0.2026
## Balanced Accuracy : 0.6805
##
## 'Positive' Class : positive
##
# Test de Box-Tidwell ----
# Q.5 + 6
# Box-Tidwell test for non-linearity: for each predictor X, add the
# term Z = X * log(X) to the model; a significant coefficient on Z
# signals a non-linear effect of X on the logit.
# NOTE(review): assumes all predictor values are > 0 (log undefined
# otherwise) -- TODO confirm the training set has no zeros.
# Loop over every predictor, i.e. all columns except the target
# (last column), instead of the hard-coded 1:4.
for (j in seq_len(ncol(DTrain) - 1)) {
  Z <- DTrain[, j] * log(DTrain[, j])
  print(paste0(">>> TEST pour ", toupper(colnames(DTrain)[j])))
  modele.bt <- glm(diabete ~ ., data = cbind(DTrain, Z), family = binomial)
  print(summary(modele.bt))
}
## [1] ">>> TEST pour PLASMA"
##
## Call:
## glm(formula = diabete ~ ., family = binomial, data = cbind(DTrain,
## Z))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2206 -0.7311 -0.4080 0.7735 2.5309
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.2472786 5.6460563 -1.638 0.10146
## plasma 0.1262044 0.2553627 0.494 0.62115
## diastolic 0.0054256 0.0124475 0.436 0.66292
## triceps 0.0469180 0.0150447 3.119 0.00182 **
## serum 0.0003827 0.0013588 0.282 0.77819
## Z -0.0151160 0.0433638 -0.349 0.72740
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 369.85 on 291 degrees of freedom
## Residual deviance: 282.72 on 286 degrees of freedom
## AIC: 294.72
##
## Number of Fisher Scoring iterations: 5
##
## [1] ">>> TEST pour DIASTOLIC"
##
## Call:
## glm(formula = diabete ~ ., family = binomial, data = cbind(DTrain,
## Z))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2641 -0.7239 -0.4127 0.7520 2.5302
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.8651303 4.7134016 -0.396 0.6923
## plasma 0.0375388 0.0061820 6.072 1.26e-09 ***
## diastolic -0.4227361 0.3661357 -1.155 0.2483
## triceps 0.0456039 0.0150208 3.036 0.0024 **
## serum 0.0004353 0.0013754 0.317 0.7516
## Z 0.0821871 0.0703965 1.167 0.2430
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 369.85 on 291 degrees of freedom
## Residual deviance: 281.59 on 286 degrees of freedom
## AIC: 293.59
##
## Number of Fisher Scoring iterations: 4
##
## [1] ">>> TEST pour TRICEPS"
##
## Call:
## glm(formula = diabete ~ ., family = binomial, data = cbind(DTrain,
## Z))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2907 -0.7308 -0.4070 0.7581 2.9793
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.061e+01 2.600e+00 -4.081 4.49e-05 ***
## plasma 3.617e-02 6.185e-03 5.848 4.98e-09 ***
## diastolic 7.117e-03 1.260e-02 0.565 0.572
## triceps 5.514e-01 3.494e-01 1.578 0.115
## serum 4.177e-04 1.373e-03 0.304 0.761
## Z -1.139e-01 7.865e-02 -1.449 0.147
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 369.85 on 291 degrees of freedom
## Residual deviance: 280.57 on 286 degrees of freedom
## AIC: 292.57
##
## Number of Fisher Scoring iterations: 5
##
## [1] ">>> TEST pour SERUM"
##
## Call:
## glm(formula = diabete ~ ., family = binomial, data = cbind(DTrain,
## Z))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9377 -0.7233 -0.3732 0.7422 2.3526
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.722036 1.302385 -6.697 2.13e-11 ***
## plasma 0.033574 0.006337 5.298 1.17e-07 ***
## diastolic 0.006834 0.012738 0.537 0.59160
## triceps 0.043989 0.015172 2.899 0.00374 **
## serum 0.056763 0.020117 2.822 0.00478 **
## Z -0.008487 0.003020 -2.810 0.00495 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 369.85 on 291 degrees of freedom
## Residual deviance: 274.42 on 286 degrees of freedom
## AIC: 286.42
##
## Number of Fisher Scoring iterations: 5
# Graphique des résidus partiels ----
# Q.7
# rms package (lrm: logistic regression)
library(rms)
## Le chargement a nécessité le package : Hmisc
## Le chargement a nécessité le package : survival
##
## Attachement du package : 'survival'
## L'objet suivant est masqué depuis 'package:caret':
##
## cluster
## Le chargement a nécessité le package : Formula
##
## Attachement du package : 'Hmisc'
## Les objets suivants sont masqués depuis 'package:base':
##
## format.pval, units
## Le chargement a nécessité le package : SparseM
##
## Attachement du package : 'SparseM'
## L'objet suivant est masqué depuis 'package:base':
##
## backsolve
# logistic regression with lrm
# options x and y keep the design matrix and the response in the
# fitted object -- required for the partial-residual plots below
modele <- rms::lrm(diabete ~ ., data = DTrain, x = TRUE, y = TRUE)
print(modele)
## Logistic Regression Model
##
## rms::lrm(formula = diabete ~ ., data = DTrain, x = TRUE, y = TRUE)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 292 LR chi2 87.00 R2 0.359 C 0.818
## negative 196 d.f. 4 g 1.576 Dxy 0.637
## positive 96 Pr(> chi2) <0.0001 gr 4.835 gamma 0.637
## max |deriv| 4e-10 gp 0.278 tau-a 0.282
## Brier 0.159
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -7.3263 1.1389 -6.43 <0.0001
## plasma 0.0373 0.0062 6.05 <0.0001
## diastolic 0.0056 0.0124 0.45 0.6500
## triceps 0.0473 0.0150 3.16 0.0016
## serum 0.0004 0.0014 0.30 0.7645
##
# Q.8 + 9 - partial-residual plots
# arranged in a 2 x 2 panel
par(mfrow=c(2,2))
plot.lrm.partial(modele)
# Régression sur variables transformées ----
# Q.10 + 11
# *****************************
# variable transformations
# suggested by the partial-residual plots
# *****************************
# square of diastolic
diastolic2 <- DTrain$diastolic^2
DTrainBis <- cbind(DTrain, diastolic2)
# square of serum
serum2 <- DTrain$serum^2
DTrainBis <- cbind(DTrainBis, serum2)
# discretization of triceps - dummy (non-nested) coding
t1 <- ifelse(DTrain$triceps >= 20 & DTrain$triceps < 30, 1, 0)
DTrainBis <- cbind(DTrainBis, t1)
t2 <- ifelse(DTrain$triceps >= 30, 1, 0)
DTrainBis <- cbind(DTrainBis, t2)
# new model with the transformed variables
# it fits the data better, cf. e.g. Nagelkerke's R2
# -- expected, since we artificially increased dimensionality
# use TRUE/FALSE rather than T/F (T and F can be reassigned)
modele.trans <- lrm(diabete ~ plasma+diastolic+diastolic2+triceps+t1+t2+serum+serum2, data = DTrainBis, x = TRUE, y = TRUE)
print(modele.trans)
## Logistic Regression Model
##
## lrm(formula = diabete ~ plasma + diastolic + diastolic2 + triceps +
## t1 + t2 + serum + serum2, data = DTrainBis, x = TRUE, y = TRUE)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 292 LR chi2 99.01 R2 0.400 C 0.829
## negative 196 d.f. 8 g 1.831 Dxy 0.658
## positive 96 Pr(> chi2) <0.0001 gr 6.239 gamma 0.658
## max |deriv| 0.0007 gp 0.295 tau-a 0.291
## Brier 0.155
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -3.7181 3.0346 -1.23 0.2205
## plasma 0.0348 0.0064 5.43 <0.0001
## diastolic -0.1125 0.0861 -1.31 0.1913
## diastolic2 0.0009 0.0006 1.40 0.1615
## triceps -0.0046 0.0298 -0.15 0.8778
## t1 1.2179 0.6281 1.94 0.0525
## t2 1.7845 0.8645 2.06 0.0390
## serum 0.0085 0.0037 2.30 0.0214
## serum2 0.0000 0.0000 -2.38 0.0175
##
# Q.12 - new partial-residual plots, 4 x 2 panel
par(mfrow = c(4, 2))
plot.lrm.partial(modele.trans)
# Q.13 - same model in glm(.) form
# (needed to call predict(..., type = "response") on new data below)
modele.trans.glm <- glm(diabete ~ plasma+diastolic+diastolic2+triceps+t1+t2+serum+serum2,data=DTrainBis,family = binomial)
print(summary(modele.trans.glm))
##
## Call:
## glm(formula = diabete ~ plasma + diastolic + diastolic2 + triceps +
## t1 + t2 + serum + serum2, family = binomial, data = DTrainBis)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8122 -0.7321 -0.3622 0.7279 2.4041
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.718e+00 3.035e+00 -1.225 0.2205
## plasma 3.479e-02 6.400e-03 5.435 5.48e-08 ***
## diastolic -1.125e-01 8.607e-02 -1.307 0.1913
## diastolic2 8.702e-04 6.215e-04 1.400 0.1615
## triceps -4.585e-03 2.982e-02 -0.154 0.8778
## t1 1.218e+00 6.281e-01 1.939 0.0525 .
## t2 1.785e+00 8.645e-01 2.064 0.0390 *
## serum 8.466e-03 3.679e-03 2.301 0.0214 *
## serum2 -1.261e-05 5.305e-06 -2.377 0.0175 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 369.85 on 291 degrees of freedom
## Residual deviance: 270.84 on 283 degrees of freedom
## AIC: 288.84
##
## Number of Fisher Scoring iterations: 5
# Q.14
# apply the same transformations to the test set
# squares of diastolic and serum
diastolic2 <- DTest$diastolic^2
serum2 <- DTest$serum^2
# dummy (non-nested) coding of triceps, same cut points as in training
t1 <- as.numeric(DTest$triceps >= 20 & DTest$triceps < 30)
t2 <- as.numeric(DTest$triceps >= 30)
DTestBis <- cbind(DTest, diastolic2, serum2, t1, t2)
# prediction with the transformed-variable model
proba2 <- predict(modele.trans.glm, newdata = DTestBis, type = "response")
pred2 <- factor(ifelse(proba2 > 0.5, "positive", "negative"))
print(summary(pred2))
## negative positive
## 893 267
# Q.15 - test-set performance
# the new variables do not translate into better test performance:
# what was gained on the bias is lost on the variance
caret::confusionMatrix(data=pred2,reference=DTest$diabete,positive="positive")
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 677 216
## positive 85 182
##
## Accuracy : 0.7405
## 95% CI : (0.7143, 0.7655)
## No Information Rate : 0.6569
## P-Value [Acc > NIR] : 5.203e-10
##
## Kappa : 0.3752
##
## Mcnemar's Test P-Value : 6.728e-14
##
## Sensitivity : 0.4573
## Specificity : 0.8885
## Pos Pred Value : 0.6816
## Neg Pred Value : 0.7581
## Prevalence : 0.3431
## Detection Rate : 0.1569
## Detection Prevalence : 0.2302
## Balanced Accuracy : 0.6729
##
## 'Positive' Class : positive
##
# Traitement des non-linéarités par la discrétisation ----
# Q.16 + 17 + 18
# package
library(discretization)
# reminder: structure of the data
str(DTrain)
## 'data.frame': 292 obs. of 5 variables:
## $ plasma : num 113 110 79 99 158 100 106 104 106 109 ...
## $ diastolic: num 50 74 60 70 84 78 56 64 54 58 ...
## $ triceps : num 10 29 42 16 41 25 27 37 21 18 ...
## $ serum : num 85 125 48 44 210 184 165 64 158 116 ...
## $ diabete : Factor w/ 2 levels "negative","positive": 1 1 1 1 2 2 1 2 1 1 ...
# supervised discretization (MDLP),
# guided by the target variable diabete,
# which sits in last position in our data frame,
# because we are in a predictive-analysis setting
disc <- discretization::mdlp(DTrain)
# display the discretization cut points
cat("\n\n")
print(disc$cutp)
## [[1]]
## [1] 127.5
##
## [[2]]
## [1] "All"
##
## [[3]]
## [1] 21.5
##
## [[4]]
## [1] 87.5
# Q.19
# the discretized data frame produced by mdlp (codes start at 1)
print(head(disc$Disc.data))
## plasma diastolic triceps serum diabete
## 1 1 1 1 1 negative
## 2 1 1 2 2 negative
## 3 1 1 2 1 negative
## 4 1 1 1 1 negative
## 5 2 1 2 2 positive
## 6 1 1 2 2 positive
# Q.20
# new training data frame: shift the codes so that they start at 0
recoded <- lapply(disc$Disc.data[1:4], function(v) v - 1)
DTrainDisc <- data.frame(recoded)
# drop diastolic, which is a constant after discretization ("All")
DTrainDisc <- DTrainDisc[-2]
# put the target variable back
DTrainDisc <- cbind(DTrainDisc, DTrain['diabete'])
str(DTrainDisc)
## 'data.frame': 292 obs. of 4 variables:
## $ plasma : num 0 0 0 0 1 0 0 0 0 0 ...
## $ triceps: num 0 1 1 0 1 1 1 1 0 0 ...
## $ serum : num 0 1 0 0 1 1 1 0 1 1 ...
## $ diabete: Factor w/ 2 levels "negative","positive": 1 1 1 1 2 2 1 2 1 1 ...
# Q.21 - rerun the regression
# with lrm, to obtain the R2
print(lrm(diabete ~ ., data = DTrainDisc, x = TRUE, y = TRUE))
## Logistic Regression Model
##
## lrm(formula = diabete ~ ., data = DTrainDisc, x = TRUE, y = TRUE)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 292 LR chi2 102.81 R2 0.413 C 0.825
## negative 196 d.f. 3 g 1.909 Dxy 0.651
## positive 96 Pr(> chi2) <0.0001 gr 6.748 gamma 0.779
## max |deriv| 3e-06 gp 0.290 tau-a 0.288
## Brier 0.152
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -4.2304 0.6266 -6.75 <0.0001
## plasma 1.6432 0.3069 5.35 <0.0001
## triceps 1.5366 0.4619 3.33 0.0009
## serum 1.9021 0.5082 3.74 0.0002
##
# with glm - to be able to run the prediction on the test set
modele.disc <- glm(diabete ~ .,data=DTrainDisc,family = binomial)
print(summary(modele.disc))
##
## Call:
## glm(formula = diabete ~ ., family = binomial, data = DTrainDisc)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5537 -0.8646 -0.3618 0.8431 2.3491
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.2304 0.6266 -6.752 1.46e-11 ***
## plasma 1.6432 0.3069 5.354 8.61e-08 ***
## triceps 1.5366 0.4619 3.327 0.000879 ***
## serum 1.9021 0.5081 3.743 0.000182 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 369.85 on 291 degrees of freedom
## Residual deviance: 267.04 on 288 degrees of freedom
## AIC: 275.04
##
## Number of Fisher Scoring iterations: 5
# Q.22 -- recode the test-set variables
# /!\ using the cut points learned on the TRAINING sample
# (do not re-run the discretization on the test set) /!\
# plasma
plasmaTest <- as.numeric(DTest$plasma > disc$cutp[[1]])
print(head(plasmaTest))
## [1] 0 1 0 0 0 0
# triceps
tricepsTest <- as.numeric(DTest$triceps > disc$cutp[[3]])
print(head(tricepsTest))
## [1] 1 1 0 1 1 1
# serum
serumTest <- as.numeric(DTest$serum > disc$cutp[[4]])
print(head(serumTest))
## [1] 0 0 0 0 0 0
# new data frame, with the same column names as in training
DTestDisc <- data.frame(plasma = plasmaTest, triceps = tricepsTest, serum = serumTest)
str(DTestDisc)
## 'data.frame': 1160 obs. of 3 variables:
## $ plasma : num 0 1 0 0 0 0 0 1 0 0 ...
## $ triceps: num 1 1 0 1 1 1 0 1 1 1 ...
## $ serum : num 0 0 0 0 0 0 0 1 1 1 ...
# Q.23
# prediction on the discretized test set
proba3 <- predict(modele.disc, newdata = DTestDisc, type = "response")
pred3 <- factor(ifelse(proba3 > 0.5, "positive", "negative"))
print(summary(pred3))
## negative positive
## 928 232
# evaluation
caret::confusionMatrix(data = pred3, reference = DTest$diabete, positive = "positive")
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 692 236
## positive 70 162
##
## Accuracy : 0.7362
## 95% CI : (0.7098, 0.7614)
## No Information Rate : 0.6569
## P-Value [Acc > NIR] : 3.766e-09
##
## Kappa : 0.35
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.4070
## Specificity : 0.9081
## Pos Pred Value : 0.6983
## Neg Pred Value : 0.7457
## Prevalence : 0.3431
## Detection Rate : 0.1397
## Detection Prevalence : 0.2000
## Balanced Accuracy : 0.6576
##
## 'Positive' Class : positive
##