#chargement rapide
data(mtcars)
#infos
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
#variables actives
X <- mtcars[c('mpg','cyl','disp','hp','drat','wt','qsec')]
#stat. descriptives
summary(X)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec
## Min. :2.760 Min. :1.513 Min. :14.50
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89
## Median :3.695 Median :3.325 Median :17.71
## Mean :3.597 Mean :3.217 Mean :17.85
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90
## Max. :4.930 Max. :5.424 Max. :22.90
#centrage réduction
Z <- scale(X)
summary(Z)
## mpg cyl disp hp
## Min. :-1.6079 Min. :-1.225 Min. :-1.2879 Min. :-1.3810
## 1st Qu.:-0.7741 1st Qu.:-1.225 1st Qu.:-0.8867 1st Qu.:-0.7320
## Median :-0.1478 Median :-0.105 Median :-0.2777 Median :-0.3455
## Mean : 0.0000 Mean : 0.000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4495 3rd Qu.: 1.015 3rd Qu.: 0.7688 3rd Qu.: 0.4859
## Max. : 2.2913 Max. : 1.015 Max. : 1.9468 Max. : 2.7466
## drat wt qsec
## Min. :-1.5646 Min. :-1.7418 Min. :-1.87401
## 1st Qu.:-0.9661 1st Qu.:-0.6500 1st Qu.:-0.53513
## Median : 0.1841 Median : 0.1101 Median :-0.07765
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.6049 3rd Qu.: 0.4014 3rd Qu.: 0.58830
## Max. : 2.4939 Max. : 2.2553 Max. : 2.82675
#k-means
set.seed(0) #init. du générateur de nombre aléatoire
km <- kmeans(Z,centers=6)
#clusters d'appartenance des individus
print(cbind(row.names(Z),km$cluster))
## [,1] [,2]
## Mazda RX4 "Mazda RX4" "5"
## Mazda RX4 Wag "Mazda RX4 Wag" "5"
## Datsun 710 "Datsun 710" "5"
## Hornet 4 Drive "Hornet 4 Drive" "3"
## Hornet Sportabout "Hornet Sportabout" "1"
## Valiant "Valiant" "3"
## Duster 360 "Duster 360" "4"
## Merc 240D "Merc 240D" "3"
## Merc 230 "Merc 230" "3"
## Merc 280 "Merc 280" "5"
## Merc 280C "Merc 280C" "5"
## Merc 450SE "Merc 450SE" "1"
## Merc 450SL "Merc 450SL" "1"
## Merc 450SLC "Merc 450SLC" "1"
## Cadillac Fleetwood "Cadillac Fleetwood" "2"
## Lincoln Continental "Lincoln Continental" "2"
## Chrysler Imperial "Chrysler Imperial" "2"
## Fiat 128 "Fiat 128" "6"
## Honda Civic "Honda Civic" "6"
## Toyota Corolla "Toyota Corolla" "6"
## Toyota Corona "Toyota Corona" "3"
## Dodge Challenger "Dodge Challenger" "1"
## AMC Javelin "AMC Javelin" "1"
## Camaro Z28 "Camaro Z28" "4"
## Pontiac Firebird "Pontiac Firebird" "1"
## Fiat X1-9 "Fiat X1-9" "6"
## Porsche 914-2 "Porsche 914-2" "6"
## Lotus Europa "Lotus Europa" "6"
## Ford Pantera L "Ford Pantera L" "4"
## Ferrari Dino "Ferrari Dino" "5"
## Maserati Bora "Maserati Bora" "4"
## Volvo 142E "Volvo 142E" "5"
#effectifs par clusters
print(km$size)
## [1] 7 3 5 4 7 6
#vérification des effectifs
print(table(km$cluster))
##
## 1 2 3 4 5 6
## 7 3 5 4 7 6
#moyennes conditionnelles
#coordonnées des barycentres conditionnels
print(km$centers)
## mpg cyl disp hp drat wt
## 1 -0.54835560 1.0148821 0.6850701 0.3400164 -1.0222260 0.4816984
## 2 -1.37006186 1.0148821 1.8284157 1.0206569 -1.0159917 2.1691456
## 3 0.25707456 -0.7769098 -0.4244185 -0.7713723 -0.3115188 -0.1239195
## 4 -0.91101250 1.0148821 0.8857454 1.8313484 0.1467002 0.3273009
## 5 0.05370226 -0.4249507 -0.6752799 -0.3829905 0.5461390 -0.3322650
## 6 1.65523937 -1.2248578 -1.1624447 -1.0382807 1.2252295 -1.3738462
## qsec
## 1 -0.29589636
## 2 -0.06085812
## 3 1.49151352
## 4 -1.54523655
## 5 -0.12401465
## 6 0.30755500
#distance entre paires de barycentres
D <- dist(km$centers)
#cah sur matrice de distances
#méthode de ward -- attention, nous avons des données pondérées
#puisqu'il y a un premier regroupement ici (individu = sous-groupe)
cah <- hclust(D,method="ward.D2",members=km$size)
plot(cah,hang=-1)
#découpage en 2 groupes
groupes <- cutree(cah,k=2)
print(groupes)
## 1 2 3 4 5 6
## 1 1 2 1 2 2
#renommer les groupes en 'A','B'
groupes <- factor(groupes,labels=c("A","B"))
print(groupes)
## 1 2 3 4 5 6
## A A B A B B
## Levels: A B
#tableau des correspondances
tc <- t(sapply(names(groupes),function(g){v <- rep('.',2); v[groupes[g]] <- 'X'; return(v)}))
colnames(tc) <- c("A","B")
print(tc)
## A B
## 1 "X" "."
## 2 "X" "."
## 3 "." "X"
## 4 "X" "."
## 5 "." "X"
## 6 "." "X"
#effectifs
print(table(groupes))
## groupes
## A B
## 3 3
Comment calculer le groupe final d’appartenance des individus ?
#groupe final d'appartenance
final <- as.character(groupes[km$cluster])
print(cbind(rownames(mtcars),final))
## final
## [1,] "Mazda RX4" "B"
## [2,] "Mazda RX4 Wag" "B"
## [3,] "Datsun 710" "B"
## [4,] "Hornet 4 Drive" "B"
## [5,] "Hornet Sportabout" "A"
## [6,] "Valiant" "B"
## [7,] "Duster 360" "A"
## [8,] "Merc 240D" "B"
## [9,] "Merc 230" "B"
## [10,] "Merc 280" "B"
## [11,] "Merc 280C" "B"
## [12,] "Merc 450SE" "A"
## [13,] "Merc 450SL" "A"
## [14,] "Merc 450SLC" "A"
## [15,] "Cadillac Fleetwood" "A"
## [16,] "Lincoln Continental" "A"
## [17,] "Chrysler Imperial" "A"
## [18,] "Fiat 128" "B"
## [19,] "Honda Civic" "B"
## [20,] "Toyota Corolla" "B"
## [21,] "Toyota Corona" "B"
## [22,] "Dodge Challenger" "A"
## [23,] "AMC Javelin" "A"
## [24,] "Camaro Z28" "A"
## [25,] "Pontiac Firebird" "A"
## [26,] "Fiat X1-9" "B"
## [27,] "Porsche 914-2" "B"
## [28,] "Lotus Europa" "B"
## [29,] "Ford Pantera L" "A"
## [30,] "Ferrari Dino" "B"
## [31,] "Maserati Bora" "A"
## [32,] "Volvo 142E" "B"
#vérification des effectifs
print(table(final))
## final
## A B
## 14 18