Importation et inspection des données

library(readxl)
D <- readxl::read_excel("heart_afdm_clustering.xlsx",sheet="var actives")
str(D)
## tibble [270 × 11] (S3: tbl_df/tbl/data.frame)
##  $ age         : num [1:270] 70 67 57 64 74 65 56 59 60 63 ...
##  $ sexe        : chr [1:270] "masculin" "feminin" "masculin" "masculin" ...
##  $ type_douleur: chr [1:270] "D" "C" "B" "D" ...
##  $ pression    : num [1:270] 130 115 124 128 120 120 130 110 140 150 ...
##  $ cholester   : num [1:270] 322 564 261 263 269 177 256 239 293 407 ...
##  $ sucre       : chr [1:270] "A" "A" "A" "A" ...
##  $ taux_max    : num [1:270] 109 160 141 105 121 140 142 142 170 154 ...
##  $ angine      : chr [1:270] "non" "non" "non" "oui" ...
##  $ depression  : num [1:270] 24 16 3 2 2 4 6 12 12 40 ...
##  $ pic         : num [1:270] 2 2 1 2 1 1 2 2 2 2 ...
##  $ vaisseau    : chr [1:270] "D" "A" "A" "B" ...
#premières lignes
head(D)

AFDM (analyse factorielle des données mixtes)

#factorminer
library(FactoMineR)

#analyse factorielle des données mixtes
afdm <- FAMD(D,ncp=5)
## Warning: ggrepel: 181 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Warning: ggrepel: 181 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

summary(afdm)
## 
## Call:
## FAMD(base = D, ncp = 5) 
## 
## 
## Eigenvalues
##                       Dim.1  Dim.2  Dim.3  Dim.4  Dim.5
## Variance              2.857  1.546  1.350  1.228  1.129
## % of var.            19.043 10.308  9.001  8.190  7.525
## Cumulative % of var. 19.043 29.351 38.353 46.542 54.068
## 
## Individuals (the 10 first)
##                  Dist    Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3
## 1            |  5.029 |  2.924  1.108  0.338 |  0.827  0.164  0.027 | -0.776
## 2            |  6.829 |  0.275  0.010  0.002 |  3.486  2.911  0.261 | -2.131
## 3            |  3.006 | -1.482  0.285  0.243 | -0.157  0.006  0.003 | -0.672
## 4            |  3.658 |  2.185  0.619  0.357 | -1.284  0.395  0.123 | -1.673
## 5            |  4.645 |  0.417  0.023  0.008 |  0.372  0.033  0.006 | -2.939
## 6            |  2.841 | -0.422  0.023  0.022 | -0.912  0.199  0.103 | -0.307
## 7            |  3.887 |  0.911  0.108  0.055 | -0.026  0.000  0.000 | -0.077
## 8            |  3.122 |  1.464  0.278  0.220 | -1.916  0.879  0.377 | -1.041
## 9            |  3.461 |  0.889  0.103  0.066 |  0.823  0.162  0.057 | -0.318
## 10           |  5.959 |  2.631  0.897  0.195 |  2.850  1.945  0.229 | -0.645
##                 ctr   cos2  
## 1             0.165  0.024 |
## 2             1.245  0.097 |
## 3             0.124  0.050 |
## 4             0.768  0.209 |
## 5             2.369  0.400 |
## 6             0.026  0.012 |
## 7             0.002  0.000 |
## 8             0.297  0.111 |
## 9             0.028  0.008 |
## 10            0.114  0.012 |
## 
## Continuous variables
##                 Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3    ctr
## age          |  0.537 10.082  0.288 |  0.462 13.818  0.214 | -0.257  4.889
## pression     |  0.318  3.541  0.101 |  0.553 19.767  0.306 |  0.246  4.478
## cholester    |  0.174  1.063  0.030 |  0.505 16.525  0.256 | -0.426 13.459
## taux_max     | -0.711 17.696  0.505 |  0.178  2.044  0.032 |  0.195  2.814
## depression   |  0.705 17.420  0.498 | -0.026  0.042  0.001 |  0.378 10.597
## pic          |  0.646 14.595  0.417 | -0.075  0.360  0.006 |  0.399 11.768
##                cos2  
## age           0.066 |
## pression      0.060 |
## cholester     0.182 |
## taux_max      0.038 |
## depression    0.143 |
## pic           0.159 |
## 
## Categories (the 10 first)
##                 Dim.1    ctr   cos2 v.test    Dim.2    ctr   cos2 v.test  
## feminin      | -0.378  0.564  0.059 -2.528 |  0.933 11.743  0.360  8.489 |
## masculin     |  0.180  0.268  0.059  2.528 | -0.444  5.583  0.360 -8.489 |
## A            |  0.191  0.033  0.003  0.525 |  0.822  2.091  0.050  3.065 |
## B            | -1.702  5.520  0.422 -7.087 |  0.094  0.057  0.001  0.530 |
## C            | -0.730  1.913  0.186 -4.558 |  0.378  1.749  0.050  3.207 |
## D            |  0.972  5.528  0.566  9.018 | -0.389  3.030  0.091 -4.912 |
## A            | -0.051  0.027  0.014 -1.191 | -0.200  1.424  0.211 -6.323 |
## B            |  0.294  0.157  0.014  1.191 |  1.149  8.188  0.211  6.323 |
## non          | -0.699  4.012  0.630 -9.671 |  0.294  2.422  0.111  5.528 |
## oui          |  1.421  8.159  0.630  9.671 | -0.598  4.925  0.111 -5.528 |
##               Dim.3    ctr   cos2 v.test  
## feminin      -0.415  3.049  0.071 -4.042 |
## masculin      0.197  1.450  0.071  4.042 |
## A             2.462 24.626  0.448  9.828 |
## B            -0.373  1.189  0.020 -2.262 |
## C             0.211  0.711  0.015  1.911 |
## D            -0.389  3.967  0.091 -5.253 |
## A            -0.100  0.470  0.053 -3.396 |
## B             0.577  2.704  0.053  3.396 |
## non           0.134  0.660  0.023  2.698 |
## oui          -0.273  1.343  0.023 -2.698 |
#coordonnées des 10 premiers individus
head(afdm$ind$coord,10)
##         Dim.1       Dim.2       Dim.3      Dim.4      Dim.5
## 1   2.9236614  0.82727957 -0.77636193 -0.7658230 -1.2464301
## 2   0.2746058  3.48618681 -2.13077617  0.8115838 -2.2266582
## 3  -1.4820922 -0.15669953 -0.67202499  0.3646365  0.5057802
## 4   2.1850086 -1.28415069 -1.67272630 -0.9667281  0.6056009
## 5   0.4167746  0.37219755 -2.93907227 -0.5645674  0.6458562
## 6  -0.4223041 -0.91159880 -0.30716132 -0.2766911  0.3131758
## 7   0.9108194 -0.02589533 -0.07672574 -2.7319840  0.9436402
## 8   1.4644450 -1.91575936 -1.04091107 -0.6474286  0.4982558
## 9   0.8892931  0.82306108 -0.31807057  1.5288849  1.0831128
## 10  2.6305767  2.84977754 -0.64497428  0.7304879 -2.2293733
#nuages par paires de variables
pairs(afdm$ind$coord,cex=0.75)

Matrice des distances + CAH

#matrice des distances à partir des 5 premiers facteur
md <- dist(afdm$ind$coord)

#cah
cah <- hclust(md,method="ward.D2")
plot(cah)

#découpage en 2 groupes
groupes <- cutree(cah,k=2)
table(groupes)
## groupes
##   1   2 
##  77 193
#nuage avec les groupes d'appartenance
pairs(afdm$ind$coord,col=c('cornflowerblue','yellowgreen')[groupes],cex=0.75)

Interprétation

Variables quantitatives.

#moyennes marginales var. quanti
sapply(D[sapply(D,is.numeric)],mean)
##        age   pression  cholester   taux_max depression        pic 
##  54.433333 131.344444 249.659259 149.677778  10.500000   1.585185
#moyennes conditionnelles var. quanti
lapply(D[sapply(D,is.numeric)],function(x){tapply(x,groupes,mean)})
## $age
##        1        2 
## 58.97403 52.62176 
## 
## $pression
##        1        2 
## 134.4416 130.1088 
## 
## $cholester
##        1        2 
## 258.4675 246.1451 
## 
## $taux_max
##        1        2 
## 125.7273 159.2332 
## 
## $depression
##         1         2 
## 19.896104  6.751295 
## 
## $pic
##        1        2 
## 2.025974 1.409326

Variables qualitatives (catégorielles).

#proportions marginales var. qualitatives
sapply(D[sapply(D,is.character)],function(x){prop.table(table(x))})
## $sexe
## x
##   feminin  masculin 
## 0.3222222 0.6777778 
## 
## $type_douleur
## x
##          A          B          C          D 
## 0.07407407 0.15555556 0.29259259 0.47777778 
## 
## $sucre
## x
##         A         B 
## 0.8518519 0.1481481 
## 
## $angine
## x
##       non       oui 
## 0.6703704 0.3296296 
## 
## $vaisseau
## x
##          A          B          C          D 
## 0.59259259 0.21481481 0.12222222 0.07037037
#proportions dans les classes
sapply(D[sapply(D,is.character)],function(x){prop.table(table(groupes,x),1)})
## $sexe
##        x
## groupes   feminin  masculin
##       1 0.2597403 0.7402597
##       2 0.3471503 0.6528497
## 
## $type_douleur
##        x
## groupes          A          B          C          D
##       1 0.00000000 0.03896104 0.09090909 0.87012987
##       2 0.10362694 0.20207254 0.37305699 0.32124352
## 
## $sucre
##        x
## groupes          A          B
##       1 0.93506494 0.06493506
##       2 0.81865285 0.18134715
## 
## $angine
##        x
## groupes       non       oui
##       1 0.2727273 0.7272727
##       2 0.8290155 0.1709845
## 
## $vaisseau
##        x
## groupes          A          B          C          D
##       1 0.29870130 0.32467532 0.19480519 0.18181818
##       2 0.70984456 0.17098446 0.09326425 0.02590674

Variable illustrative (coeur)

#chargement variable illustrative
DIllus <- readxl::read_excel("heart_afdm_clustering.xlsx",sheet="var illustrative")
head(DIllus)
#proportions des modalités
prop.table(table(DIllus$coeur))
## 
##   absence  presence 
## 0.5555556 0.4444444
#croisement entre les classes réelles et les groupes clustering
table(DIllus$coeur,groupes)
##           groupes
##              1   2
##   absence    9 141
##   presence  68  52