Créer un compteur de temps pour les chunks.

knitr::knit_hooks$set(time_it = local({
  # Start time of the chunk currently being evaluated. It lives in the
  # enclosing environment so that the "before" and "after" invocations of the
  # hook can share it.
  chunk_start <- NULL
  function(before, options) {
    if (!before) {
      # After the chunk: elapsed time since the recorded start, measured in
      # seconds and reported as a whole number of milliseconds.
      elapsed <- difftime(Sys.time(), chunk_start, units = "secs")
      paste("Time for this code chunk to run:", round(elapsed * 1000, 0), "ms.")
    } else {
      # Before the chunk: remember when it started.
      chunk_start <<- Sys.time()
    }
  }
}))

Chargement des données

On peut activer l’option `time_it = TRUE` dans l’en-tête d’un chunk pour mesurer et afficher son temps de traitement.

# data.table package (the original note records version 1.16.4)
library(data.table)

# Load the data set with fread(), data.table's fast file reader.
# NOTE(review): the original comment mentions on-the-fly decompression, but the
# path below is a plain .txt file -- confirm which file is actually intended.
D <- data.table::fread("kddcup99twice.txt", header = TRUE)

# Dimensions of the table: number of rows, then number of columns
print(c(nrow(D), ncol(D)))
## [1] 9796862      42

Time for this code chunk to run: 2542 ms.

# List of the variables (column names) of D
print(names(D))
##  [1] "V1"  "V2"  "V3"  "V4"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10" "V11" "V12"
## [13] "V13" "V14" "V15" "V16" "V17" "V18" "V19" "V20" "V21" "V22" "V23" "V24"
## [25] "V25" "V26" "V27" "V28" "V29" "V30" "V31" "V32" "V33" "V34" "V35" "V36"
## [37] "V37" "V38" "V39" "V40" "V41" "V42"
# First rows of the table (head() shows 6 rows by default)
head(D, n = 6L)
# Class of each column, returned as a character vector named after the columns
print(sapply(D, FUN = class))
##          V1          V2          V3          V4          V5          V6 
##   "integer" "character" "character" "character"   "integer"   "integer" 
##          V7          V8          V9         V10         V11         V12 
##   "integer"   "integer"   "integer"   "integer"   "integer"   "integer" 
##         V13         V14         V15         V16         V17         V18 
##   "integer"   "integer"   "integer"   "integer"   "integer"   "integer" 
##         V19         V20         V21         V22         V23         V24 
##   "integer"   "integer"   "integer"   "integer"   "integer"   "integer" 
##         V25         V26         V27         V28         V29         V30 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##         V31         V32         V33         V34         V35         V36 
##   "numeric"   "integer"   "integer"   "numeric"   "numeric"   "numeric" 
##         V37         V38         V39         V40         V41         V42 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" "character"

Filtrage

# Count the rows that satisfy both conditions; .N is the number of rows
# selected by the filter expression.
print(D[V2 == "udp" & V42 == "normal.", .N])
## [1] 382696

Time for this code chunk to run: 81 ms.

Moyennes des variables numériques

# Mean of every numeric column of D.
# .SDcols = is.numeric restricts .SD to the columns for which is.numeric() is
# TRUE (integer and double). vapply() enforces exactly one numeric value per
# column, so the result is always a numeric vector named after the columns.
# The stray empty positional `by` argument of the original call is dropped.
print(D[, vapply(.SD, mean, numeric(1)), .SDcols = is.numeric])
##           V1           V5           V6           V7           V8           V9 
## 4.834243e+01 1.834621e+03 1.093623e+03 5.716116e-06 6.487792e-04 7.961733e-06 
##          V10          V11          V12          V13          V14          V15 
## 1.243766e-02 3.205108e-05 1.435290e-01 8.088304e-03 6.818510e-05 3.674646e-05 
##          V16          V17          V18          V19          V20          V21 
## 1.293496e-02 1.188748e-03 7.430951e-05 1.021143e-03 0.000000e+00 4.082940e-07 
##          V22          V23          V24          V25          V26          V27 
## 8.351654e-04 3.349734e+02 2.952671e+02 1.779703e-01 1.780370e-01 5.766509e-02 
##          V28          V29          V30          V31          V32          V33 
## 5.773010e-02 7.898842e-01 2.117961e-02 2.826080e-02 2.329811e+02 1.892142e+02 
##          V34          V35          V36          V37          V38          V39 
## 7.537132e-01 3.071111e-02 6.050520e-01 6.464107e-03 1.780911e-01 1.778859e-01 
##          V40          V41 
## 5.792780e-02 5.765941e-02

Time for this code chunk to run: 430 ms.

Moyennes avec filtrage

# Selection then projection: keep only the rows where V42 equals "normal.",
# then compute the mean of every numeric column on that subset.
# .SDcols = is.numeric restricts .SD to the numeric (integer and double)
# columns. vapply() enforces exactly one numeric value per column, so the
# result is always a numeric vector named after the columns. The stray empty
# positional `by` argument of the original call is dropped.
print(D[V42 == "normal.", vapply(.SD, mean, numeric(1)), .SDcols = is.numeric])
##           V1           V5           V6           V7           V8           V9 
## 2.178247e+02 1.477846e+03 3.234650e+03 7.195864e-06 0.000000e+00 3.597932e-05 
##          V10          V11          V12          V13          V14          V15 
## 4.953530e-02 9.868614e-05 7.192677e-01 3.838891e-02 3.104501e-04 1.840085e-04 
##          V16          V17          V18          V19          V20          V21 
## 6.497043e-02 5.887245e-03 3.628772e-04 5.130651e-03 0.000000e+00 2.055961e-06 
##          V22          V23          V24          V25          V26          V27 
## 3.881655e-03 8.159029e+00 1.091279e+01 1.483057e-03 1.724581e-03 5.594070e-02 
##          V28          V29          V30          V31          V32          V33 
## 5.620641e-02 9.852575e-01 1.853534e-02 1.324944e-01 1.484984e+02 2.020148e+02 
##          V34          V35          V36          V37          V38          V39 
## 8.448792e-01 5.650019e-02 1.349402e-01 2.434030e-02 2.039328e-03 1.050185e-03 
##          V40          V41 
## 5.778443e-02 5.601557e-02

Time for this code chunk to run: 293 ms.

Tri selon une variable

# Descending sort on V6: the "-" prefix in front of the column name requests
# decreasing order.
# setorder() reorders D itself by reference and returns it, so `res` refers to
# the same (now sorted) table rather than to a sorted copy.
res <- data.table::setorder(D, -V6)
# Show only the first row, i.e. the one with the largest V6
head(res, n = 1L)

Time for this code chunk to run: 836 ms.

Comptage des valeurs

# Number of rows for each distinct value of V42
res <- D[, .N, by = V42]
# Note: the count column is automatically named "N"; sort on it in
# decreasing order before printing.
print(data.table::setorder(res, -N))
##                  V42       N
##               <char>   <int>
##  1:           smurf. 5615772
##  2:         neptune. 2144034
##  3:          normal. 1945562
##  4:           satan.   31784
##  5:         ipsweep.   24962
##  6:       portsweep.   20826
##  7:            nmap.    4632
##  8:            back.    4406
##  9:     warezclient.    2040
## 10:        teardrop.    1958
## 11:             pod.     528
## 12:    guess_passwd.     106
## 13: buffer_overflow.      60
## 14:            land.      42
## 15:     warezmaster.      40
## 16:            imap.      24
## 17:         rootkit.      20
## 18:      loadmodule.      18
## 19:       ftp_write.      16
## 20:        multihop.      14
## 21:             phf.       8
## 22:            perl.       6
## 23:             spy.       4
##                  V42       N

Time for this code chunk to run: 273 ms.

Moyennes conditionnelles (1 critère)

# Group-by style aggregation: mean of V34 for each value of V42
res <- D[, mean(V34), by = V42]
# Sort the result in increasing order of the mean. Note that "V1" is the name
# data.table assigns automatically to the unnamed aggregate column.
print(data.table::setorder(res, V1))
##                  V42          V1
##               <char>       <num>
##  1:       portsweep. 0.002928071
##  2:            perl. 0.013333333
##  3:           satan. 0.014214699
##  4:         neptune. 0.043371318
##  5:             spy. 0.185000000
##  6:        teardrop. 0.246833504
##  7:         rootkit. 0.306000000
##  8:            nmap. 0.526735751
##  9:             pod. 0.659734848
## 10:        multihop. 0.715714286
## 11:     warezclient. 0.735441176
## 12:      loadmodule. 0.835555556
## 13:          normal. 0.844879197
## 14:            land. 0.870000000
## 15:       ftp_write. 0.875000000
## 16:     warezmaster. 0.900000000
## 17:            imap. 0.916666667
## 18:         ipsweep. 0.930468712
## 19:             phf. 0.972500000
## 20:           smurf. 0.999691383
## 21: buffer_overflow. 1.000000000
## 22:            back. 1.000000000
## 23:    guess_passwd. 1.000000000
##                  V42          V1

Time for this code chunk to run: 135 ms.

Moyennes conditionnelles (2 critères)

# Group-by on two keys: mean of V34 for each (V42, V2) combination. The
# unnamed aggregate column is automatically named "V1".
print(D[, mean(V34), by = c("V42", "V2")])
##                  V42     V2           V1
##               <char> <char>        <num>
##  1:       portsweep.    tcp 0.0027346978
##  2:          normal.    tcp 0.9048007988
##  3:     warezmaster.    tcp 0.9000000000
##  4:        multihop.    tcp 0.7157142857
##  5:            imap.    tcp 0.9166666667
##  6: buffer_overflow.    tcp 1.0000000000
##  7:         ipsweep.    tcp 0.0684090909
##  8:       ftp_write.    tcp 0.8750000000
##  9:         rootkit.    tcp 0.1514285714
## 10:           satan.    tcp 0.0007506892
## 11:      loadmodule.    tcp 0.8355555556
## 12:            back.    tcp 1.0000000000
## 13:             phf.    tcp 0.9725000000
## 14:            perl.    tcp 0.0133333333
## 15:     warezclient.    tcp 0.7354411765
## 16:    guess_passwd.    tcp 1.0000000000
## 17:             spy.    tcp 0.1850000000
## 18:         neptune.    tcp 0.0433713178
## 19:          normal.    udp 0.6258512239
## 20:           satan.    udp 0.1214988290
## 21:            nmap.    udp 0.7534400000
## 22:        teardrop.    udp 0.2468335036
## 23:         rootkit.    udp 0.6666666667
## 24:          normal.   icmp 0.5197696466
## 25:           smurf.   icmp 0.9996913835
## 26:             pod.   icmp 0.6597348485
## 27:            land.    tcp 0.8700000000
## 28:           satan.   icmp 0.2097297297
## 29:         ipsweep.   icmp 0.9993917107
## 30:            nmap.   icmp 0.9960077519
## 31:            nmap.    tcp 0.0035589942
## 32:       portsweep.   icmp 0.3383333333
##                  V42     V2           V1

Time for this code chunk to run: 145 ms.