Versions¶
In [1]:
#version de datatable
import datatable as dt
dt.__version__
Out[1]:
'1.1.0'
Chargement¶
In [2]:
#changer le répertoire courant
import os
os.chdir("C:/Users/ricco/Desktop/demo")
In [3]:
%%time
#importer les données
D = dt.fread("kddcup99twice.txt",sep=",")
display(D.head())
V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | … | V38 | V39 | V40 | V41 | V42 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪ | ▪ | ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪▪▪▪▪ | ▪▪▪▪▪▪▪▪ | ▪▪▪▪▪▪▪▪ | ▪▪▪▪▪▪▪▪ | ▪▪▪▪ | ||
0 | 0 | tcp | http | SF | 215 | 45076 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | normal. |
1 | 0 | tcp | http | SF | 215 | 45076 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | normal. |
2 | 0 | tcp | http | SF | 162 | 4528 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | normal. |
3 | 0 | tcp | http | SF | 162 | 4528 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | normal. |
4 | 0 | tcp | http | SF | 236 | 1228 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | normal. |
5 | 0 | tcp | http | SF | 236 | 1228 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | normal. |
6 | 0 | tcp | http | SF | 233 | 2032 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | normal. |
7 | 0 | tcp | http | SF | 233 | 2032 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | normal. |
8 | 0 | tcp | http | SF | 239 | 486 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | normal. |
9 | 0 | tcp | http | SF | 239 | 486 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | normal. |
CPU times: total: 7.67 s Wall time: 1.22 s
In [4]:
#type de l'objet
print(type(D))
<class 'datatable.Frame'>
In [5]:
# liste des variables
print(D.names)
('V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42')
In [6]:
#type des variables
print([D[:,col].type for col in D.names])
[Type.int32, Type.str32, Type.str32, Type.str32, Type.int32, Type.int32, Type.bool8, Type.int32, Type.int32, Type.int32, Type.int32, Type.bool8, Type.int32, Type.bool8, Type.int32, Type.int32, Type.int32, Type.int32, Type.int32, Type.bool8, Type.bool8, Type.bool8, Type.int32, Type.int32, Type.float64, Type.float64, Type.float64, Type.float64, Type.float64, Type.float64, Type.float64, Type.int32, Type.int32, Type.float64, Type.float64, Type.float64, Type.float64, Type.float64, Type.float64, Type.float64, Type.float64, Type.str32]
Filtrage¶
In [7]:
#f-expression - alias pour accéder aux colonnes
#voir https://datatable.readthedocs.io/en/latest/manual/f-expressions.html
from datatable import f
In [8]:
%%time
#requête - 382696 obs.
res = D[(f.V42=="normal.") & (f.V2=="udp"),:].shape[0]
display(res)
382696
CPU times: total: 891 ms Wall time: 164 ms
Moyennes pour variables numériques¶
In [9]:
%%time
#liste des variables numériques
numeric_columns = [col for col in D.names if D[:, col].type != dt.Type.str32]
# Calculer la moyenne des colonnes numériques
moyennes = D[:,numeric_columns].mean()
#on a un dataframe des moyennes par variable
#on fait la somme des moyennes pour contrôle
print(moyennes[:,dt.rowsum(f[float])])
| C0 | float64 -- + ------- 0 | 4032.38 [1 row x 1 column] CPU times: total: 2.69 s Wall time: 384 ms
Filtrage + moyennes¶
In [10]:
%%time
#liste des variables numériques
numeric_columns = [col for col in D.names if D[:, col].type != dt.Type.str32]
#Calculer la moyenne des colonnes numériques avec filtrage
moyennes = D[(f.V42 == 'normal.'),numeric_columns].mean()
print(moyennes[:,dt.rowsum(f[float])])
| C0 | float64 -- + ------- 0 | 5303.22 [1 row x 1 column] CPU times: total: 1.38 s Wall time: 239 ms
Tri selon une variable¶
In [11]:
%%time
#tri décroissant
#utilisation de la fonction sort() pour indiquer la colnne de référence
res = D[:,:,dt.sort('V6',reverse=True)]
display(res[0,:])
V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | … | V38 | V39 | V40 | V41 | V42 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪ | ▪ | ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪▪▪▪▪ | ▪▪▪▪▪▪▪▪ | ▪▪▪▪▪▪▪▪ | ▪▪▪▪▪▪▪▪ | ▪▪▪▪ | ||
0 | 10999 | tcp | other | RSTR | 0 | 1309937401 | 0 | 0 | 0 | 0 | … | 0 | 0 | 1 | 1 | portsweep. |
CPU times: total: 453 ms Wall time: 117 ms
Comptage de valeurs¶
In [12]:
%%time
#première partie: comptage, résultat = dataframe avec colonne "count"
res = D[:,dt.count(),dt.by("V42")]
#deuxième partie : tri du résultat sur la colonne "count"
display(res[:,:,dt.sort('count',reverse=True)])
V42 | count | |
---|---|---|
▪▪▪▪ | ▪▪▪▪▪▪▪▪ | |
0 | smurf. | 5615772 |
1 | neptune. | 2144034 |
2 | normal. | 1945562 |
3 | satan. | 31784 |
4 | ipsweep. | 24962 |
5 | portsweep. | 20826 |
6 | nmap. | 4632 |
7 | back. | 4406 |
8 | warezclient. | 2040 |
9 | teardrop. | 1958 |
10 | pod. | 528 |
11 | guess_passwd. | 106 |
12 | buffer_overflow. | 60 |
13 | land. | 42 |
14 | warezmaster. | 40 |
15 | imap. | 24 |
16 | rootkit. | 20 |
17 | loadmodule. | 18 |
18 | ftp_write. | 16 |
19 | multihop. | 14 |
20 | phf. | 8 |
21 | perl. | 6 |
22 | spy. | 4 |
CPU times: total: 1.84 s Wall time: 262 ms
Moyennes conditionnelles (1 critère)¶
In [13]:
%%time
#group by + moyenne -- utilisation de l'alias f
res = D[:,dt.mean(f.V34),dt.by('V42')]
#résultat = data frame -> tri sur les moyennes calculées V34 ensuite
display(res[:,:,dt.sort('V34')])
V42 | V34 | |
---|---|---|
▪▪▪▪ | ▪▪▪▪▪▪▪▪ | |
0 | portsweep. | 0.00292807 |
1 | perl. | 0.0133333 |
2 | satan. | 0.0142147 |
3 | neptune. | 0.0433713 |
4 | spy. | 0.185 |
5 | teardrop. | 0.246834 |
6 | rootkit. | 0.306 |
7 | nmap. | 0.526736 |
8 | pod. | 0.659735 |
9 | multihop. | 0.715714 |
10 | warezclient. | 0.735441 |
11 | loadmodule. | 0.835556 |
12 | normal. | 0.844879 |
13 | land. | 0.87 |
14 | ftp_write. | 0.875 |
15 | warezmaster. | 0.9 |
16 | imap. | 0.916667 |
17 | ipsweep. | 0.930469 |
18 | phf. | 0.9725 |
19 | smurf. | 0.999691 |
20 | back. | 1 |
21 | buffer_overflow. | 1 |
22 | guess_passwd. | 1 |
CPU times: total: 1.62 s Wall time: 313 ms
Moyennes conditionnelles (2 critères)¶
In [14]:
%%time
#dixit la doc. https://datatable.readthedocs.io/en/latest/manual/comparison_with_pandas.html
#datatable n'implémente pas l'équivant de pivot_table à ce jour
#on passe par une solution de type group_by
display(D[:,dt.mean(f.V34),dt.by('V42','V2')])
V42 | V2 | V34 | |
---|---|---|---|
▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪▪▪▪▪ | |
0 | back. | tcp | 1 |
1 | buffer_overflow. | tcp | 1 |
2 | ftp_write. | tcp | 0.875 |
3 | guess_passwd. | tcp | 1 |
4 | imap. | tcp | 0.916667 |
5 | ipsweep. | icmp | 0.999392 |
6 | ipsweep. | tcp | 0.0684091 |
7 | land. | tcp | 0.87 |
8 | loadmodule. | tcp | 0.835556 |
9 | multihop. | tcp | 0.715714 |
10 | neptune. | tcp | 0.0433713 |
11 | nmap. | icmp | 0.996008 |
12 | nmap. | tcp | 0.00355899 |
13 | nmap. | udp | 0.75344 |
14 | normal. | icmp | 0.51977 |
⋮ | ⋮ | ⋮ | ⋮ |
27 | smurf. | icmp | 0.999691 |
28 | spy. | tcp | 0.185 |
29 | teardrop. | udp | 0.246834 |
30 | warezclient. | tcp | 0.735441 |
31 | warezmaster. | tcp | 0.9 |
CPU times: total: 2.66 s Wall time: 459 ms