# charger les packages
library(xlsx) #lecture directe des fichiers EXCEL
# Q.0 charger les données
donnees <- read.xlsx(file="infidelites.xlsx",header=T,sheetIndex=1,stringsAsFactors=TRUE,encoding="UTF-8")
# description
str(donnees)
## 'data.frame': 601 obs. of 9 variables:
## $ Y : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sex : num 1 0 0 1 1 0 0 1 0 1 ...
## $ Age : num 37 27 32 57 22 32 22 57 32 22 ...
## $ YearsMarried : num 10 4 15 15 0.75 1.5 0.75 15 15 1.5 ...
## $ Children : num 0 0 1 1 0 0 0 1 1 0 ...
## $ Religious : num 3 4 1 5 2 2 2 2 4 4 ...
## $ Education : num 18 14 12 18 17 17 12 14 16 14 ...
## $ Occupation : num 7 6 1 6 6 5 1 4 1 4 ...
## $ RatingMarriage: num 4 4 4 5 3 5 3 4 2 5 ...
# summary
summary(donnees)
## Y Sex Age YearsMarried
## Min. : 0.000 Min. :0.0000 Min. :17.50 Min. : 0.125
## 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.:27.00 1st Qu.: 4.000
## Median : 0.000 Median :0.0000 Median :32.00 Median : 7.000
## Mean : 1.456 Mean :0.4759 Mean :32.49 Mean : 8.178
## 3rd Qu.: 0.000 3rd Qu.:1.0000 3rd Qu.:37.00 3rd Qu.:15.000
## Max. :12.000 Max. :1.0000 Max. :57.00 Max. :15.000
## Children Religious Education Occupation
## Min. :0.0000 Min. :1.000 Min. : 9.00 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:14.00 1st Qu.:3.000
## Median :1.0000 Median :3.000 Median :16.00 Median :5.000
## Mean :0.7155 Mean :3.116 Mean :16.17 Mean :4.195
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:18.00 3rd Qu.:6.000
## Max. :1.0000 Max. :5.000 Max. :20.00 Max. :7.000
## RatingMarriage
## Min. :1.000
## 1st Qu.:3.000
## Median :4.000
## Mean :3.932
## 3rd Qu.:5.000
## Max. :5.000
# nombre d'observations
print(nrow(donnees))
## [1] 601
# nombre de variables
print(ncol(donnees))
## [1] 9
# Q.1.a - Recoder la variable cible Y
Z <- factor(ifelse(donnees$Y == 0, 0, 1))
print(table(Z))
## Z
## 0 1
## 451 150
# Q.1.b - Nouvelle base
infidelite <- cbind(Z,donnees[,2:9])
str(infidelite)
## 'data.frame': 601 obs. of 9 variables:
## $ Z : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Sex : num 1 0 0 1 1 0 0 1 0 1 ...
## $ Age : num 37 27 32 57 22 32 22 57 32 22 ...
## $ YearsMarried : num 10 4 15 15 0.75 1.5 0.75 15 15 1.5 ...
## $ Children : num 0 0 1 1 0 0 0 1 1 0 ...
## $ Religious : num 3 4 1 5 2 2 2 2 4 4 ...
## $ Education : num 18 14 12 18 17 17 12 14 16 14 ...
## $ Occupation : num 7 6 1 6 6 5 1 4 1 4 ...
## $ RatingMarriage: num 4 4 4 5 3 5 3 4 2 5 ...
# Q.2 Différecienciation homme/femme
# comptabilisons les effectifs h/f
table(infidelite$Sex)
##
## 0 1
## 315 286
# Sex n'est pas siginificatif à 5%, pas de différenciation
print(summary(glm(Z ~ Sex, data = infidelite, family = binomial)))
##
## Call:
## glm(formula = Z ~ Sex, family = binomial, data = infidelite)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7981 -0.7981 -0.7204 -0.7204 1.7181
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.2164 0.1342 -9.065 <2e-16 ***
## Sex 0.2356 0.1888 1.248 0.212
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 675.38 on 600 degrees of freedom
## Residual deviance: 673.82 on 599 degrees of freedom
## AIC: 677.82
##
## Number of Fisher Scoring iterations: 4
# Q.3 - regression logistique
# variables significatives Ă 5% au sens de Wald :
# age, years married, religious, rating marriage
# signe du coefficient positif : plus de risque d'Ăªtre infidèle
# signe négatif : l'inverse
modele <- summary(glm(Z ~ ., data = infidelite, family = binomial))
print(modele)
##
## Call:
## glm(formula = Z ~ ., family = binomial, data = infidelite)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5713 -0.7499 -0.5690 -0.2539 2.5191
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.37726 0.88776 1.551 0.120807
## Sex 0.28029 0.23909 1.172 0.241083
## Age -0.04426 0.01825 -2.425 0.015301 *
## YearsMarried 0.09477 0.03221 2.942 0.003262 **
## Children 0.39767 0.29151 1.364 0.172508
## Religious -0.32472 0.08975 -3.618 0.000297 ***
## Education 0.02105 0.05051 0.417 0.676851
## Occupation 0.03092 0.07178 0.431 0.666630
## RatingMarriage -0.46845 0.09091 -5.153 2.56e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 675.38 on 600 degrees of freedom
## Residual deviance: 609.51 on 592 degrees of freedom
## AIC: 627.51
##
## Number of Fisher Scoring iterations: 4
# Q.4 - Tester le retrait de Sex, Children, Education, Occupation
modele.2 <- summary(glm(Z ~ Age + YearsMarried+Religious+RatingMarriage, data = infidelite, family = binomial))
print(modele.2)
##
## Call:
## glm(formula = Z ~ Age + YearsMarried + Religious + RatingMarriage,
## family = binomial, data = infidelite)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6278 -0.7550 -0.5701 -0.2624 2.3998
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.93083 0.61032 3.164 0.001558 **
## Age -0.03527 0.01736 -2.032 0.042127 *
## YearsMarried 0.10062 0.02921 3.445 0.000571 ***
## Religious -0.32902 0.08945 -3.678 0.000235 ***
## RatingMarriage -0.46136 0.08884 -5.193 2.06e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 675.38 on 600 degrees of freedom
## Residual deviance: 615.36 on 596 degrees of freedom
## AIC: 625.36
##
## Number of Fisher Scoring iterations: 4
# Q.4.a - comparaison des AIC > le second modèle présente un AIC plus faible
# => le retrait des variables est justifié
print(modele$aic - modele.2$aic)
## [1] 2.152582
# Q.4.b - test du rapport de vraisemblance
# Ă 5%, les 4 variables prises en bloc ne sont pas significatives
# leur retrait est justifié
LR <- modele.2$deviance - modele$deviance
print(paste("Stat. de test =",LR))
## [1] "Stat. de test = 5.84741787095584"
print(paste("p-value =",pchisq(LR,4,lower.tail=F)))
## [1] "p-value = 0.21083665926769"
# Q.5 corrélation
print(cor(infidelite$Age,infidelite$YearsMarried))
## [1] 0.7775458
# mĂªme si ce n'est pas demandĂ©, un graphique est toujours intĂ©ressant
# argh, attention les ex-aequos
plot(infidelite$Age,infidelite$YearsMarried,type="p")
# Q.6.a - impact de l'Ă¢ge sur l'infidelitĂ©
print(summary(glm(Z ~ Age, data = infidelite, family = binomial)))
##
## Call:
## glm(formula = Z ~ Age, family = binomial, data = infidelite)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8746 -0.7773 -0.7316 -0.6901 1.7616
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.557192 0.341628 -4.558 5.16e-06 ***
## Age 0.013920 0.009927 1.402 0.161
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 675.38 on 600 degrees of freedom
## Residual deviance: 673.44 on 599 degrees of freedom
## AIC: 677.44
##
## Number of Fisher Scoring iterations: 4
# Q.6.b - impact des années de mariage sur l'infidelité
print(summary(glm(Z ~ YearsMarried, data = infidelite, family = binomial)))
##
## Call:
## glm(formula = Z ~ YearsMarried, family = binomial, data = infidelite)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8883 -0.7846 -0.6720 -0.6061 1.8894
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.60859 0.18327 -8.777 < 2e-16 ***
## YearsMarried 0.05882 0.01724 3.412 0.000646 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 675.38 on 600 degrees of freedom
## Residual deviance: 663.50 on 599 degrees of freedom
## AIC: 667.5
##
## Number of Fisher Scoring iterations: 4
# Q.6.c - prise en compte de Age et YearsMarried
print(summary(glm(Z ~ Age + YearsMarried, data = infidelite, family = binomial)))
##
## Call:
## glm(formula = Z ~ Age + YearsMarried, family = binomial, data = infidelite)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0024 -0.7660 -0.6696 -0.4439 2.0237
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.90046 0.40000 -2.251 0.024376 *
## Age -0.03286 0.01657 -1.984 0.047309 *
## YearsMarried 0.10168 0.02756 3.689 0.000225 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 675.38 on 600 degrees of freedom
## Residual deviance: 659.40 on 598 degrees of freedom
## AIC: 665.4
##
## Number of Fisher Scoring iterations: 4
# Q.6.d - graphique Age vs. YearsMarried, illustré par Z
plot(jitter(infidelite$Age,a=0.65),jitter(infidelite$YearsMarried,a=0.65),col=c("green","red")[infidelite$Z])
#*******************
#analyse Ă 3 classes
#*******************
# Q.7.a
G <- donnees$Y
G[donnees$Y==0] <- 1
G[donnees$Y>=1 & donnees$Y<=3] <- 2
G[donnees$Y>3] <- 3
G <- factor(G)
levels(G) <- c("jamais","un_peu","beaucoup")
#distribution
print(table(G))
## G
## jamais un_peu beaucoup
## 451 70 80
# Q.7.b - avec un arbre de décision
# nouvelle base de données
infidelite.ter <- cbind(G,donnees[,2:9])
# arbre de décision avec rpart
library(rpart)
arbre <- rpart(G ~ ., data = infidelite.ter,control=rpart.control(minsplit=5,minbucket=2))
print(arbre)
## n= 601
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 601 150 jamais (0.7504160 0.1164725 0.1331115)
## 2) RatingMarriage>=2.5 519 109 jamais (0.7899807 0.1098266 0.1001927) *
## 3) RatingMarriage< 2.5 82 41 jamais (0.5000000 0.1585366 0.3414634)
## 6) Age>=49.5 10 0 jamais (1.0000000 0.0000000 0.0000000) *
## 7) Age< 49.5 72 41 jamais (0.4305556 0.1805556 0.3888889)
## 14) Age< 29.5 27 11 jamais (0.5925926 0.2592593 0.1481481)
## 28) Religious>=1.5 24 8 jamais (0.6666667 0.2083333 0.1250000) *
## 29) Religious< 1.5 3 1 un_peu (0.0000000 0.6666667 0.3333333) *
## 15) Age>=29.5 45 21 beaucoup (0.3333333 0.1333333 0.5333333)
## 30) Occupation>=6.5 2 0 jamais (1.0000000 0.0000000 0.0000000) *
## 31) Occupation< 6.5 43 19 beaucoup (0.3023256 0.1395349 0.5581395) *
# affichage graphique plus sympathique
library(rpart.plot)
rpart.plot(arbre)
# Q.7.b - avec une analyse discriminante
library(MASS)
adl <- lda(G ~ ., data = infidelite.ter)
print(adl)
## Call:
## lda(G ~ ., data = infidelite.ter)
##
## Prior probabilities of groups:
## jamais un_peu beaucoup
## 0.7504160 0.1164725 0.1331115
##
## Group means:
## Sex Age YearsMarried Children Religious Education
## jamais 0.4611973 32.18071 7.727279 0.6807095 3.203991 16.13969
## un_peu 0.5714286 31.93571 8.297029 0.8142857 3.014286 16.11429
## beaucoup 0.4750000 34.70000 10.612500 0.8250000 2.712500 16.36250
## Occupation RatingMarriage
## jamais 4.155211 4.093126
## un_peu 4.171429 3.671429
## beaucoup 4.437500 3.250000
##
## Coefficients of linear discriminants:
## LD1 LD2
## Sex 0.09826803 -1.48354296
## Age -0.04853526 0.04874528
## YearsMarried 0.13458676 0.07571924
## Children 0.16027444 -1.53691038
## Religious -0.42823485 -0.11021099
## Education 0.03738695 0.08348897
## Occupation 0.06526249 0.12100552
## RatingMarriage -0.64709678 0.12319550
##
## Proportion of trace:
## LD1 LD2
## 0.8955 0.1045
projection <- predict(adl)
plot(projection$x[,1],projection$x[,2],col=c("green","blue","red")[unclass(G)])