Créer un compteur de temps pour les chunks.
# Register a knitr chunk hook named `time_it` that reports how long a chunk
# took to run. The hook function receives `before` (TRUE when invoked ahead of
# the chunk, FALSE afterwards) and the chunk `options` (unused here).
knitr::knit_hooks$set(time_it = local({
  # Start time of the chunk being evaluated; kept in the enclosing
  # environment so that both invocations of the hook can share it.
  chunk_start <- NULL
  function(before, options) {
    if (!before) {
      # After the chunk: elapsed time since the recorded start, in seconds.
      elapsed <- difftime(Sys.time(), chunk_start, units = "secs")
      # Convert to whole milliseconds and return the message as a string.
      elapsed_ms <- round(elapsed * 1000, 0)
      paste("Time for this code chunk to run:", elapsed_ms, "ms.")
    } else {
      # Before the chunk: remember when it started.
      chunk_start <<- Sys.time()
    }
  }
}))
On peut ensuite activer l’option `time_it = TRUE` dans l’en-tête d’un chunk pour mesurer et afficher son temps de traitement.
# data.table package (version 1.16.4 was used)
library(data.table)
# Read the whole file with fread, data.table's fast reader; the first line of
# the file is treated as the column header.
# NOTE(review): the original comment mentioned on-the-fly decompression, but
# the path below is a plain .txt file - confirm.
D <- data.table::fread(
  "kddcup99twice.txt",
  header = TRUE
)
# Dimensions of the loaded table: number of rows, then number of columns
print(c(nrow(D), ncol(D)))
## [1] 9796862 42
Time for this code chunk to run: 2542 ms.
# List of the variable (column) names
print(names(D))
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10" "V11" "V12"
## [13] "V13" "V14" "V15" "V16" "V17" "V18" "V19" "V20" "V21" "V22" "V23" "V24"
## [25] "V25" "V26" "V27" "V28" "V29" "V30" "V31" "V32" "V33" "V34" "V35" "V36"
## [37] "V37" "V38" "V39" "V40" "V41" "V42"
# First rows of the table
head(D, n = 6L)
# Class of each column, as a vector named after the columns
print(sapply(D, function(column) class(column)))
## V1 V2 V3 V4 V5 V6
## "integer" "character" "character" "character" "integer" "integer"
## V7 V8 V9 V10 V11 V12
## "integer" "integer" "integer" "integer" "integer" "integer"
## V13 V14 V15 V16 V17 V18
## "integer" "integer" "integer" "integer" "integer" "integer"
## V19 V20 V21 V22 V23 V24
## "integer" "integer" "integer" "integer" "integer" "integer"
## V25 V26 V27 V28 V29 V30
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## V31 V32 V33 V34 V35 V36
## "numeric" "integer" "integer" "numeric" "numeric" "numeric"
## V37 V38 V39 V40 V41 V42
## "numeric" "numeric" "numeric" "numeric" "numeric" "character"
# Count the rows that satisfy both conditions: V2 is "udp" and V42 is
# "normal."; .N is the number of rows kept by the filter.
print(D[V2 == "udp" & V42 == "normal.", .N])
## [1] 382696
Time for this code chunk to run: 81 ms.
# Mean of every numeric column.
# `.SDcols = is.numeric` restricts .SD to the numeric (integer or double)
# columns. vapply() is used instead of sapply() so the result is always a
# named numeric vector, and the stray empty argument of the original call
# (`,,`) is dropped.
print(D[, vapply(.SD, mean, numeric(1L)), .SDcols = is.numeric])
## V1 V5 V6 V7 V8 V9
## 4.834243e+01 1.834621e+03 1.093623e+03 5.716116e-06 6.487792e-04 7.961733e-06
## V10 V11 V12 V13 V14 V15
## 1.243766e-02 3.205108e-05 1.435290e-01 8.088304e-03 6.818510e-05 3.674646e-05
## V16 V17 V18 V19 V20 V21
## 1.293496e-02 1.188748e-03 7.430951e-05 1.021143e-03 0.000000e+00 4.082940e-07
## V22 V23 V24 V25 V26 V27
## 8.351654e-04 3.349734e+02 2.952671e+02 1.779703e-01 1.780370e-01 5.766509e-02
## V28 V29 V30 V31 V32 V33
## 5.773010e-02 7.898842e-01 2.117961e-02 2.826080e-02 2.329811e+02 1.892142e+02
## V34 V35 V36 V37 V38 V39
## 7.537132e-01 3.071111e-02 6.050520e-01 6.464107e-03 1.780911e-01 1.778859e-01
## V40 V41
## 5.792780e-02 5.765941e-02
Time for this code chunk to run: 430 ms.
# Selection then projection: keep the rows where V42 is "normal." and compute
# the mean of every numeric column on that subset.
# vapply() is used instead of sapply() so the result is always a named numeric
# vector, and the stray empty argument of the original call (`,,`) is dropped.
print(D[V42 == "normal.", vapply(.SD, mean, numeric(1L)), .SDcols = is.numeric])
## V1 V5 V6 V7 V8 V9
## 2.178247e+02 1.477846e+03 3.234650e+03 7.195864e-06 0.000000e+00 3.597932e-05
## V10 V11 V12 V13 V14 V15
## 4.953530e-02 9.868614e-05 7.192677e-01 3.838891e-02 3.104501e-04 1.840085e-04
## V16 V17 V18 V19 V20 V21
## 6.497043e-02 5.887245e-03 3.628772e-04 5.130651e-03 0.000000e+00 2.055961e-06
## V22 V23 V24 V25 V26 V27
## 3.881655e-03 8.159029e+00 1.091279e+01 1.483057e-03 1.724581e-03 5.594070e-02
## V28 V29 V30 V31 V32 V33
## 5.620641e-02 9.852575e-01 1.853534e-02 1.324944e-01 1.484984e+02 2.020148e+02
## V34 V35 V36 V37 V38 V39
## 8.448792e-01 5.650019e-02 1.349402e-01 2.434030e-02 2.039328e-03 1.050185e-03
## V40 V41
## 5.778443e-02 5.601557e-02
Time for this code chunk to run: 293 ms.
# Descending sort on V6: the "-" in front of the column name requests
# decreasing order.
# setorder() reorders D itself by reference and returns it, so `res` below is
# the same (now sorted) table as D, not an independent copy.
data.table::setorder(D, -V6)
res <- D
# Show the first row, i.e. the one with the largest V6
head(res, n = 1L)
Time for this code chunk to run: 836 ms.
# Number of rows for each value of V42
res <- D[, .N, by = V42]
# The count column is automatically named N; sort on it in decreasing order
# (in place), then display the result.
data.table::setorder(res, -N)
print(res)
## V42 N
## <char> <int>
## 1: smurf. 5615772
## 2: neptune. 2144034
## 3: normal. 1945562
## 4: satan. 31784
## 5: ipsweep. 24962
## 6: portsweep. 20826
## 7: nmap. 4632
## 8: back. 4406
## 9: warezclient. 2040
## 10: teardrop. 1958
## 11: pod. 528
## 12: guess_passwd. 106
## 13: buffer_overflow. 60
## 14: land. 42
## 15: warezmaster. 40
## 16: imap. 24
## 17: rootkit. 20
## 18: loadmodule. 18
## 19: ftp_write. 16
## 20: multihop. 14
## 21: phf. 8
## 22: perl. 6
## 23: spy. 4
## V42 N
Time for this code chunk to run: 273 ms.
# Group-by style aggregation: mean of V34 for each value of V42
res <- D[, mean(V34), by = V42]
# The unnamed aggregate column is automatically named V1; sort on it in
# increasing order (in place), then display the result.
data.table::setorder(res, V1)
print(res)
## V42 V1
## <char> <num>
## 1: portsweep. 0.002928071
## 2: perl. 0.013333333
## 3: satan. 0.014214699
## 4: neptune. 0.043371318
## 5: spy. 0.185000000
## 6: teardrop. 0.246833504
## 7: rootkit. 0.306000000
## 8: nmap. 0.526735751
## 9: pod. 0.659734848
## 10: multihop. 0.715714286
## 11: warezclient. 0.735441176
## 12: loadmodule. 0.835555556
## 13: normal. 0.844879197
## 14: land. 0.870000000
## 15: ftp_write. 0.875000000
## 16: warezmaster. 0.900000000
## 17: imap. 0.916666667
## 18: ipsweep. 0.930468712
## 19: phf. 0.972500000
## 20: smurf. 0.999691383
## 21: buffer_overflow. 1.000000000
## 22: back. 1.000000000
## 23: guess_passwd. 1.000000000
## V42 V1
Time for this code chunk to run: 135 ms.
# Group by on two keys: mean of V34 for each (V42, V2) combination.
# The unnamed aggregate column is labelled V1 in the result.
print(D[, mean(V34), by = .(V42, V2)])
## V42 V2 V1
## <char> <char> <num>
## 1: portsweep. tcp 0.0027346978
## 2: normal. tcp 0.9048007988
## 3: warezmaster. tcp 0.9000000000
## 4: multihop. tcp 0.7157142857
## 5: imap. tcp 0.9166666667
## 6: buffer_overflow. tcp 1.0000000000
## 7: ipsweep. tcp 0.0684090909
## 8: ftp_write. tcp 0.8750000000
## 9: rootkit. tcp 0.1514285714
## 10: satan. tcp 0.0007506892
## 11: loadmodule. tcp 0.8355555556
## 12: back. tcp 1.0000000000
## 13: phf. tcp 0.9725000000
## 14: perl. tcp 0.0133333333
## 15: warezclient. tcp 0.7354411765
## 16: guess_passwd. tcp 1.0000000000
## 17: spy. tcp 0.1850000000
## 18: neptune. tcp 0.0433713178
## 19: normal. udp 0.6258512239
## 20: satan. udp 0.1214988290
## 21: nmap. udp 0.7534400000
## 22: teardrop. udp 0.2468335036
## 23: rootkit. udp 0.6666666667
## 24: normal. icmp 0.5197696466
## 25: smurf. icmp 0.9996913835
## 26: pod. icmp 0.6597348485
## 27: land. tcp 0.8700000000
## 28: satan. icmp 0.2097297297
## 29: ipsweep. icmp 0.9993917107
## 30: nmap. icmp 0.9960077519
## 31: nmap. tcp 0.0035589942
## 32: portsweep. icmp 0.3383333333
## V42 V2 V1
Time for this code chunk to run: 145 ms.