# charger les packages
library(xlsx) #lecture directe des fichiers EXCEL

# Q.0 charger les données
donnees <- read.xlsx(file="infidelites.xlsx",header=T,sheetIndex=1,stringsAsFactors=TRUE,encoding="UTF-8")

# description
str(donnees)
## 'data.frame':    601 obs. of  9 variables:
##  $ Y             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sex           : num  1 0 0 1 1 0 0 1 0 1 ...
##  $ Age           : num  37 27 32 57 22 32 22 57 32 22 ...
##  $ YearsMarried  : num  10 4 15 15 0.75 1.5 0.75 15 15 1.5 ...
##  $ Children      : num  0 0 1 1 0 0 0 1 1 0 ...
##  $ Religious     : num  3 4 1 5 2 2 2 2 4 4 ...
##  $ Education     : num  18 14 12 18 17 17 12 14 16 14 ...
##  $ Occupation    : num  7 6 1 6 6 5 1 4 1 4 ...
##  $ RatingMarriage: num  4 4 4 5 3 5 3 4 2 5 ...
# summary
summary(donnees)
##        Y               Sex              Age         YearsMarried   
##  Min.   : 0.000   Min.   :0.0000   Min.   :17.50   Min.   : 0.125  
##  1st Qu.: 0.000   1st Qu.:0.0000   1st Qu.:27.00   1st Qu.: 4.000  
##  Median : 0.000   Median :0.0000   Median :32.00   Median : 7.000  
##  Mean   : 1.456   Mean   :0.4759   Mean   :32.49   Mean   : 8.178  
##  3rd Qu.: 0.000   3rd Qu.:1.0000   3rd Qu.:37.00   3rd Qu.:15.000  
##  Max.   :12.000   Max.   :1.0000   Max.   :57.00   Max.   :15.000  
##     Children        Religious       Education       Occupation   
##  Min.   :0.0000   Min.   :1.000   Min.   : 9.00   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:14.00   1st Qu.:3.000  
##  Median :1.0000   Median :3.000   Median :16.00   Median :5.000  
##  Mean   :0.7155   Mean   :3.116   Mean   :16.17   Mean   :4.195  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:18.00   3rd Qu.:6.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :20.00   Max.   :7.000  
##  RatingMarriage 
##  Min.   :1.000  
##  1st Qu.:3.000  
##  Median :4.000  
##  Mean   :3.932  
##  3rd Qu.:5.000  
##  Max.   :5.000
# nombre d'observations
print(nrow(donnees))
## [1] 601
# nombre de variables 
print(ncol(donnees))
## [1] 9
# Q.1.a - Recoder la variable cible Y
Z <- factor(ifelse(donnees$Y == 0, 0, 1))
print(table(Z))
## Z
##   0   1 
## 451 150
# Q.1.b - Nouvelle base
infidelite <- cbind(Z,donnees[,2:9])
str(infidelite)
## 'data.frame':    601 obs. of  9 variables:
##  $ Z             : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Sex           : num  1 0 0 1 1 0 0 1 0 1 ...
##  $ Age           : num  37 27 32 57 22 32 22 57 32 22 ...
##  $ YearsMarried  : num  10 4 15 15 0.75 1.5 0.75 15 15 1.5 ...
##  $ Children      : num  0 0 1 1 0 0 0 1 1 0 ...
##  $ Religious     : num  3 4 1 5 2 2 2 2 4 4 ...
##  $ Education     : num  18 14 12 18 17 17 12 14 16 14 ...
##  $ Occupation    : num  7 6 1 6 6 5 1 4 1 4 ...
##  $ RatingMarriage: num  4 4 4 5 3 5 3 4 2 5 ...
# Q.2 Différecienciation homme/femme

# comptabilisons les effectifs h/f
table(infidelite$Sex)
## 
##   0   1 
## 315 286
# Sex n'est pas siginificatif à 5%, pas de différenciation
print(summary(glm(Z ~ Sex, data = infidelite, family = binomial)))
## 
## Call:
## glm(formula = Z ~ Sex, family = binomial, data = infidelite)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7981  -0.7981  -0.7204  -0.7204   1.7181  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.2164     0.1342  -9.065   <2e-16 ***
## Sex           0.2356     0.1888   1.248    0.212    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 675.38  on 600  degrees of freedom
## Residual deviance: 673.82  on 599  degrees of freedom
## AIC: 677.82
## 
## Number of Fisher Scoring iterations: 4
# Q.3 - regression logistique
# variables significatives Ă  5% au sens de Wald :
# age, years married, religious, rating marriage
# signe du coefficient positif : plus de risque d'Ăªtre infidèle
# signe négatif : l'inverse
modele <- summary(glm(Z ~ ., data = infidelite, family = binomial))
print(modele)
## 
## Call:
## glm(formula = Z ~ ., family = binomial, data = infidelite)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5713  -0.7499  -0.5690  -0.2539   2.5191  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.37726    0.88776   1.551 0.120807    
## Sex             0.28029    0.23909   1.172 0.241083    
## Age            -0.04426    0.01825  -2.425 0.015301 *  
## YearsMarried    0.09477    0.03221   2.942 0.003262 ** 
## Children        0.39767    0.29151   1.364 0.172508    
## Religious      -0.32472    0.08975  -3.618 0.000297 ***
## Education       0.02105    0.05051   0.417 0.676851    
## Occupation      0.03092    0.07178   0.431 0.666630    
## RatingMarriage -0.46845    0.09091  -5.153 2.56e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 675.38  on 600  degrees of freedom
## Residual deviance: 609.51  on 592  degrees of freedom
## AIC: 627.51
## 
## Number of Fisher Scoring iterations: 4
# Q.4 - Tester le retrait de Sex, Children, Education, Occupation
modele.2 <- summary(glm(Z ~ Age + YearsMarried+Religious+RatingMarriage, data = infidelite, family = binomial))
print(modele.2)
## 
## Call:
## glm(formula = Z ~ Age + YearsMarried + Religious + RatingMarriage, 
##     family = binomial, data = infidelite)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6278  -0.7550  -0.5701  -0.2624   2.3998  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.93083    0.61032   3.164 0.001558 ** 
## Age            -0.03527    0.01736  -2.032 0.042127 *  
## YearsMarried    0.10062    0.02921   3.445 0.000571 ***
## Religious      -0.32902    0.08945  -3.678 0.000235 ***
## RatingMarriage -0.46136    0.08884  -5.193 2.06e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 675.38  on 600  degrees of freedom
## Residual deviance: 615.36  on 596  degrees of freedom
## AIC: 625.36
## 
## Number of Fisher Scoring iterations: 4
# Q.4.a - comparaison des AIC > le second modèle présente un AIC plus faible
# => le retrait des variables est justifié
print(modele$aic - modele.2$aic)
## [1] 2.152582
# Q.4.b - test du rapport de vraisemblance
# Ă  5%, les 4 variables prises en bloc ne sont pas significatives
# leur retrait est justifié
LR <- modele.2$deviance - modele$deviance
print(paste("Stat. de test =",LR))
## [1] "Stat. de test = 5.84741787095584"
print(paste("p-value =",pchisq(LR,4,lower.tail=F)))
## [1] "p-value = 0.21083665926769"
# Q.5 corrélation
print(cor(infidelite$Age,infidelite$YearsMarried))
## [1] 0.7775458
# mĂªme si ce n'est pas demandĂ©, un graphique est toujours intĂ©ressant
# argh, attention les ex-aequos
plot(infidelite$Age,infidelite$YearsMarried,type="p")

# Q.6.a - impact de l'Ă¢ge sur l'infidelitĂ©
print(summary(glm(Z ~ Age, data = infidelite, family = binomial)))
## 
## Call:
## glm(formula = Z ~ Age, family = binomial, data = infidelite)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8746  -0.7773  -0.7316  -0.6901   1.7616  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.557192   0.341628  -4.558 5.16e-06 ***
## Age          0.013920   0.009927   1.402    0.161    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 675.38  on 600  degrees of freedom
## Residual deviance: 673.44  on 599  degrees of freedom
## AIC: 677.44
## 
## Number of Fisher Scoring iterations: 4
# Q.6.b - impact des années de mariage sur l'infidelité
print(summary(glm(Z ~ YearsMarried, data = infidelite, family = binomial)))
## 
## Call:
## glm(formula = Z ~ YearsMarried, family = binomial, data = infidelite)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8883  -0.7846  -0.6720  -0.6061   1.8894  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.60859    0.18327  -8.777  < 2e-16 ***
## YearsMarried  0.05882    0.01724   3.412 0.000646 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 675.38  on 600  degrees of freedom
## Residual deviance: 663.50  on 599  degrees of freedom
## AIC: 667.5
## 
## Number of Fisher Scoring iterations: 4
# Q.6.c - prise en compte de Age et YearsMarried
print(summary(glm(Z ~ Age + YearsMarried, data = infidelite, family = binomial)))
## 
## Call:
## glm(formula = Z ~ Age + YearsMarried, family = binomial, data = infidelite)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0024  -0.7660  -0.6696  -0.4439   2.0237  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.90046    0.40000  -2.251 0.024376 *  
## Age          -0.03286    0.01657  -1.984 0.047309 *  
## YearsMarried  0.10168    0.02756   3.689 0.000225 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 675.38  on 600  degrees of freedom
## Residual deviance: 659.40  on 598  degrees of freedom
## AIC: 665.4
## 
## Number of Fisher Scoring iterations: 4
# Q.6.d - graphique Age vs. YearsMarried, illustré par Z
plot(jitter(infidelite$Age,a=0.65),jitter(infidelite$YearsMarried,a=0.65),col=c("green","red")[infidelite$Z])

#*******************
#analyse Ă  3 classes
#*******************

# Q.7.a
G <- donnees$Y
G[donnees$Y==0] <- 1
G[donnees$Y>=1 & donnees$Y<=3] <- 2
G[donnees$Y>3] <- 3
G <- factor(G)
levels(G) <- c("jamais","un_peu","beaucoup")

#distribution
print(table(G))
## G
##   jamais   un_peu beaucoup 
##      451       70       80
# Q.7.b - avec un arbre de décision

# nouvelle base de données
infidelite.ter <- cbind(G,donnees[,2:9])

# arbre de décision avec rpart
library(rpart)
arbre <- rpart(G ~ ., data = infidelite.ter,control=rpart.control(minsplit=5,minbucket=2))
print(arbre)
## n= 601 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 601 150 jamais (0.7504160 0.1164725 0.1331115)  
##    2) RatingMarriage>=2.5 519 109 jamais (0.7899807 0.1098266 0.1001927) *
##    3) RatingMarriage< 2.5 82  41 jamais (0.5000000 0.1585366 0.3414634)  
##      6) Age>=49.5 10   0 jamais (1.0000000 0.0000000 0.0000000) *
##      7) Age< 49.5 72  41 jamais (0.4305556 0.1805556 0.3888889)  
##       14) Age< 29.5 27  11 jamais (0.5925926 0.2592593 0.1481481)  
##         28) Religious>=1.5 24   8 jamais (0.6666667 0.2083333 0.1250000) *
##         29) Religious< 1.5 3   1 un_peu (0.0000000 0.6666667 0.3333333) *
##       15) Age>=29.5 45  21 beaucoup (0.3333333 0.1333333 0.5333333)  
##         30) Occupation>=6.5 2   0 jamais (1.0000000 0.0000000 0.0000000) *
##         31) Occupation< 6.5 43  19 beaucoup (0.3023256 0.1395349 0.5581395) *
# affichage graphique plus sympathique
library(rpart.plot)
rpart.plot(arbre)

# Q.7.b - avec une analyse discriminante
library(MASS)
adl <- lda(G ~ ., data = infidelite.ter)
print(adl)
## Call:
## lda(G ~ ., data = infidelite.ter)
## 
## Prior probabilities of groups:
##    jamais    un_peu  beaucoup 
## 0.7504160 0.1164725 0.1331115 
## 
## Group means:
##                Sex      Age YearsMarried  Children Religious Education
## jamais   0.4611973 32.18071     7.727279 0.6807095  3.203991  16.13969
## un_peu   0.5714286 31.93571     8.297029 0.8142857  3.014286  16.11429
## beaucoup 0.4750000 34.70000    10.612500 0.8250000  2.712500  16.36250
##          Occupation RatingMarriage
## jamais     4.155211       4.093126
## un_peu     4.171429       3.671429
## beaucoup   4.437500       3.250000
## 
## Coefficients of linear discriminants:
##                        LD1         LD2
## Sex             0.09826803 -1.48354296
## Age            -0.04853526  0.04874528
## YearsMarried    0.13458676  0.07571924
## Children        0.16027444 -1.53691038
## Religious      -0.42823485 -0.11021099
## Education       0.03738695  0.08348897
## Occupation      0.06526249  0.12100552
## RatingMarriage -0.64709678  0.12319550
## 
## Proportion of trace:
##    LD1    LD2 
## 0.8955 0.1045
projection <- predict(adl)
plot(projection$x[,1],projection$x[,2],col=c("green","blue","red")[unclass(G)])