Importation des données

#chargement rapide
data(mtcars)

#infos
str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
#variables actives
X <- mtcars[c('mpg','cyl','disp','hp','drat','wt','qsec')]

#stat. descriptives
summary(X)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec      
##  Min.   :2.760   Min.   :1.513   Min.   :14.50  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89  
##  Median :3.695   Median :3.325   Median :17.71  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90
#centrage réduction
Z <- scale(X)
summary(Z)
##       mpg               cyl              disp               hp         
##  Min.   :-1.6079   Min.   :-1.225   Min.   :-1.2879   Min.   :-1.3810  
##  1st Qu.:-0.7741   1st Qu.:-1.225   1st Qu.:-0.8867   1st Qu.:-0.7320  
##  Median :-0.1478   Median :-0.105   Median :-0.2777   Median :-0.3455  
##  Mean   : 0.0000   Mean   : 0.000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.4495   3rd Qu.: 1.015   3rd Qu.: 0.7688   3rd Qu.: 0.4859  
##  Max.   : 2.2913   Max.   : 1.015   Max.   : 1.9468   Max.   : 2.7466  
##       drat               wt               qsec         
##  Min.   :-1.5646   Min.   :-1.7418   Min.   :-1.87401  
##  1st Qu.:-0.9661   1st Qu.:-0.6500   1st Qu.:-0.53513  
##  Median : 0.1841   Median : 0.1101   Median :-0.07765  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.6049   3rd Qu.: 0.4014   3rd Qu.: 0.58830  
##  Max.   : 2.4939   Max.   : 2.2553   Max.   : 2.82675

K-Means en 6 clusters

#k-means
set.seed(0) #init. du générateur de nombre aléatoire
km <- kmeans(Z,centers=6)

#clusters d'appartenance des individus
print(cbind(row.names(Z),km$cluster))
##                     [,1]                  [,2]
## Mazda RX4           "Mazda RX4"           "5" 
## Mazda RX4 Wag       "Mazda RX4 Wag"       "5" 
## Datsun 710          "Datsun 710"          "5" 
## Hornet 4 Drive      "Hornet 4 Drive"      "3" 
## Hornet Sportabout   "Hornet Sportabout"   "1" 
## Valiant             "Valiant"             "3" 
## Duster 360          "Duster 360"          "4" 
## Merc 240D           "Merc 240D"           "3" 
## Merc 230            "Merc 230"            "3" 
## Merc 280            "Merc 280"            "5" 
## Merc 280C           "Merc 280C"           "5" 
## Merc 450SE          "Merc 450SE"          "1" 
## Merc 450SL          "Merc 450SL"          "1" 
## Merc 450SLC         "Merc 450SLC"         "1" 
## Cadillac Fleetwood  "Cadillac Fleetwood"  "2" 
## Lincoln Continental "Lincoln Continental" "2" 
## Chrysler Imperial   "Chrysler Imperial"   "2" 
## Fiat 128            "Fiat 128"            "6" 
## Honda Civic         "Honda Civic"         "6" 
## Toyota Corolla      "Toyota Corolla"      "6" 
## Toyota Corona       "Toyota Corona"       "3" 
## Dodge Challenger    "Dodge Challenger"    "1" 
## AMC Javelin         "AMC Javelin"         "1" 
## Camaro Z28          "Camaro Z28"          "4" 
## Pontiac Firebird    "Pontiac Firebird"    "1" 
## Fiat X1-9           "Fiat X1-9"           "6" 
## Porsche 914-2       "Porsche 914-2"       "6" 
## Lotus Europa        "Lotus Europa"        "6" 
## Ford Pantera L      "Ford Pantera L"      "4" 
## Ferrari Dino        "Ferrari Dino"        "5" 
## Maserati Bora       "Maserati Bora"       "4" 
## Volvo 142E          "Volvo 142E"          "5"
#effectifs par clusters
print(km$size)
## [1] 7 3 5 4 7 6
#vérification des effectifs
print(table(km$cluster))
## 
## 1 2 3 4 5 6 
## 7 3 5 4 7 6
#moyennes conditionnelles
#coordonnées des barycentres conditionnels
print(km$centers)
##           mpg        cyl       disp         hp       drat         wt
## 1 -0.54835560  1.0148821  0.6850701  0.3400164 -1.0222260  0.4816984
## 2 -1.37006186  1.0148821  1.8284157  1.0206569 -1.0159917  2.1691456
## 3  0.25707456 -0.7769098 -0.4244185 -0.7713723 -0.3115188 -0.1239195
## 4 -0.91101250  1.0148821  0.8857454  1.8313484  0.1467002  0.3273009
## 5  0.05370226 -0.4249507 -0.6752799 -0.3829905  0.5461390 -0.3322650
## 6  1.65523937 -1.2248578 -1.1624447 -1.0382807  1.2252295 -1.3738462
##          qsec
## 1 -0.29589636
## 2 -0.06085812
## 3  1.49151352
## 4 -1.54523655
## 5 -0.12401465
## 6  0.30755500

CAH à partir des clusters K-Means

#distance entre paires de barycentres
D <- dist(km$centers)
#cah sur matrice de distances
#méthode de ward -- attention, nous avons des données pondérées
#puisqu'il y a un premier regroupement ici (individu = sous-groupe)
cah <- hclust(D,method="ward.D2",members=km$size)
plot(cah,hang=-1)

#découpage en 2 groupes
groupes <- cutree(cah,k=2)
print(groupes)
## 1 2 3 4 5 6 
## 1 1 2 1 2 2
#renommer les groupes en 'A','B'
groupes <- factor(groupes,labels=c("A","B"))
print(groupes)
## 1 2 3 4 5 6 
## A A B A B B 
## Levels: A B
#tableau des correspondances
tc <- t(sapply(names(groupes),function(g){v <- rep('.',2); v[groupes[g]] <- 'X'; return(v)}))
colnames(tc) <- c("A","B")
print(tc)
##   A   B  
## 1 "X" "."
## 2 "X" "."
## 3 "." "X"
## 4 "X" "."
## 5 "." "X"
## 6 "." "X"
#effectifs
print(table(groupes))
## groupes
## A B 
## 3 3

Appariement des résultats

Comment calculer le groupe final d’appartenance des individus ?

#groupe final d'appartenance
final <- as.character(groupes[km$cluster])
print(cbind(rownames(mtcars),final))
##                             final
##  [1,] "Mazda RX4"           "B"  
##  [2,] "Mazda RX4 Wag"       "B"  
##  [3,] "Datsun 710"          "B"  
##  [4,] "Hornet 4 Drive"      "B"  
##  [5,] "Hornet Sportabout"   "A"  
##  [6,] "Valiant"             "B"  
##  [7,] "Duster 360"          "A"  
##  [8,] "Merc 240D"           "B"  
##  [9,] "Merc 230"            "B"  
## [10,] "Merc 280"            "B"  
## [11,] "Merc 280C"           "B"  
## [12,] "Merc 450SE"          "A"  
## [13,] "Merc 450SL"          "A"  
## [14,] "Merc 450SLC"         "A"  
## [15,] "Cadillac Fleetwood"  "A"  
## [16,] "Lincoln Continental" "A"  
## [17,] "Chrysler Imperial"   "A"  
## [18,] "Fiat 128"            "B"  
## [19,] "Honda Civic"         "B"  
## [20,] "Toyota Corolla"      "B"  
## [21,] "Toyota Corona"       "B"  
## [22,] "Dodge Challenger"    "A"  
## [23,] "AMC Javelin"         "A"  
## [24,] "Camaro Z28"          "A"  
## [25,] "Pontiac Firebird"    "A"  
## [26,] "Fiat X1-9"           "B"  
## [27,] "Porsche 914-2"       "B"  
## [28,] "Lotus Europa"        "B"  
## [29,] "Ford Pantera L"      "A"  
## [30,] "Ferrari Dino"        "B"  
## [31,] "Maserati Bora"       "A"  
## [32,] "Volvo 142E"          "B"
#vérification des effectifs
print(table(final))
## final
##  A  B 
## 14 18