Versions¶
In [1]:
#pandas
import pandas
pandas.__version__
Out[1]:
'2.2.3'
In [2]:
#polars
import polars
polars.__version__
Out[2]:
'1.19.0'
Chargement¶
In [3]:
#changer le répertoire courant
import os
os.chdir("C:/Users/ricco/Desktop/demo")
In [4]:
%%time
#importer les données avec Pandas
import pandas
dPandas = pandas.read_csv("kddcup99twice.txt")
display(dPandas.head())
V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | ... | V33 | V34 | V35 | V36 | V37 | V38 | V39 | V40 | V41 | V42 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | tcp | http | SF | 215 | 45076 | 0 | 0 | 0 | 0 | ... | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | normal. |
1 | 0 | tcp | http | SF | 215 | 45076 | 0 | 0 | 0 | 0 | ... | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | normal. |
2 | 0 | tcp | http | SF | 162 | 4528 | 0 | 0 | 0 | 0 | ... | 1 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | normal. |
3 | 0 | tcp | http | SF | 162 | 4528 | 0 | 0 | 0 | 0 | ... | 1 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | normal. |
4 | 0 | tcp | http | SF | 236 | 1228 | 0 | 0 | 0 | 0 | ... | 2 | 1.0 | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | normal. |
5 rows × 42 columns
CPU times: total: 19.8 s Wall time: 21.2 s
In [5]:
%%time
#avec polars
dPolars = polars.read_csv("kddcup99twice.txt")
display(dPolars.head())
shape: (5, 42)
V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | V29 | V30 | V31 | V32 | V33 | V34 | V35 | V36 | V37 | V38 | V39 | V40 | V41 | V42 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
i64 | str | str | str | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | i64 | i64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | str |
0 | "tcp" | "http" | "SF" | 215 | 45076 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | "normal." |
0 | "tcp" | "http" | "SF" | 215 | 45076 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | "normal." |
0 | "tcp" | "http" | "SF" | 162 | 4528 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 | 1 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | "normal." |
0 | "tcp" | "http" | "SF" | 162 | 4528 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 | 1 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | "normal." |
0 | "tcp" | "http" | "SF" | 236 | 1228 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 2 | 2 | 1.0 | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | "normal." |
CPU times: total: 10.5 s Wall time: 1.96 s
In [6]:
#types des objets
print(type(dPandas))
print(type(dPolars))
<class 'pandas.core.frame.DataFrame'> <class 'polars.dataframe.frame.DataFrame'>
Filtrage¶
In [7]:
%%time
#requête Pandas - 382696 obs.
res = dPandas.loc[(dPandas.V42=="normal.") & (dPandas.V2=="udp")].shape[0]
display(res)
382696
CPU times: total: 859 ms Wall time: 976 ms
In [8]:
%%time
#requête Polars - 382696 obs.
res = dPolars.filter((polars.col("V42")=="normal.") & (polars.col("V2")=="udp")).shape[0]
display(res)
382696
CPU times: total: 141 ms Wall time: 58.4 ms
Moyennes pour variables numériques¶
In [9]:
%%time
#moyennes avec Pandas
res = dPandas.mean(axis=0,numeric_only=True)
display(res.sum())
np.float64(4032.3814143100103)
CPU times: total: 609 ms Wall time: 645 ms
In [10]:
%%time
# Sélectionner uniquement les colonnes numériques
numeric_columns = dPolars.select(polars.col(polars.Float64, polars.Int64))
# Calculer les moyennes pour toutes les colonnes numériques
mean_df = numeric_columns.select(
[polars.col(col).mean().alias(f"mean_{col}") for col in numeric_columns.columns]
)
print(mean_df.sum_horizontal())
shape: (1,) Series: 'sum' [f64] [ 4032.381414 ] CPU times: total: 656 ms Wall time: 109 ms
Filtrage + moyennes¶
In [11]:
%%time
#que les variables quanti toujours
res = dPandas.loc[dPandas.V42=="normal."].mean(axis=0,numeric_only=True)
display(res.sum())
np.float64(5303.223380298341)
CPU times: total: 766 ms Wall time: 826 ms
In [12]:
%%time
#enchaînement filtrer les individus
#et sélectionner uniquement les colonnes numériques
fn_columns = (dPolars.filter(polars.col("V42")=="normal.")
.select(polars.col(polars.Float64, polars.Int64)))
# Calculer les moyennes pour toutes les colonnes numériques
mean_df = fn_columns.select(
[polars.col(col).mean().alias(f"mean_{col}") for col in numeric_columns.columns]
)
print(mean_df.sum_horizontal())
shape: (1,) Series: 'sum' [f64] [ 5303.22338 ] CPU times: total: 500 ms Wall time: 97.4 ms
Tri selon une variable¶
In [13]:
%%time
#tri selon Pandas
res = dPandas.sort_values(by='V6',ascending=False)
display(res.head(1))
V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | ... | V33 | V34 | V35 | V36 | V37 | V38 | V39 | V40 | V41 | V42 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2835143 | 10999 | tcp | other | RSTR | 0 | 1309937401 | 0 | 0 | 0 | 0 | ... | 1 | 0.0 | 0.65 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | portsweep. |
1 rows × 42 columns
CPU times: total: 2.23 s Wall time: 2.32 s
In [14]:
%%time
#tri selon Polars
res = dPolars.sort(by=polars.col('V6'),descending=True)
display(res.head(1))
shape: (1, 42)
V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | V29 | V30 | V31 | V32 | V33 | V34 | V35 | V36 | V37 | V38 | V39 | V40 | V41 | V42 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
i64 | str | str | str | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | i64 | i64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | str |
10999 | "tcp" | "other" | "RSTR" | 0 | 1309937401 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 255 | 1 | 0.0 | 0.65 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | "portsweep." |
CPU times: total: 22.6 s Wall time: 5.21 s
Comptage de valeurs¶
In [15]:
%%time
#pandas
display(dPandas.V42.value_counts().sort_values(ascending=False))
V42 smurf. 5615772 neptune. 2144034 normal. 1945562 satan. 31784 ipsweep. 24962 portsweep. 20826 nmap. 4632 back. 4406 warezclient. 2040 teardrop. 1958 pod. 528 guess_passwd. 106 buffer_overflow. 60 land. 42 warezmaster. 40 imap. 24 rootkit. 20 loadmodule. 18 ftp_write. 16 multihop. 14 phf. 8 perl. 6 spy. 4 Name: count, dtype: int64
CPU times: total: 328 ms Wall time: 509 ms
In [16]:
%%time
#polar
display(dPolars['V42'].value_counts().sort('count',descending=True))
shape: (23, 2)
V42 | count |
---|---|
str | u32 |
"smurf." | 5615772 |
"neptune." | 2144034 |
"normal." | 1945562 |
"satan." | 31784 |
"ipsweep." | 24962 |
… | … |
"ftp_write." | 16 |
"multihop." | 14 |
"phf." | 8 |
"perl." | 6 |
"spy." | 4 |
CPU times: total: 219 ms Wall time: 292 ms
Moyennes conditionnelles (1 critère)¶
In [17]:
%%time
#pandas
display(dPandas.groupby('V42')['V34'].mean().sort_values())
V42 portsweep. 0.002928 perl. 0.013333 satan. 0.014215 neptune. 0.043371 spy. 0.185000 teardrop. 0.246834 rootkit. 0.306000 nmap. 0.526736 pod. 0.659735 multihop. 0.715714 warezclient. 0.735441 loadmodule. 0.835556 normal. 0.844879 land. 0.870000 ftp_write. 0.875000 warezmaster. 0.900000 imap. 0.916667 ipsweep. 0.930469 phf. 0.972500 smurf. 0.999691 back. 1.000000 guess_passwd. 1.000000 buffer_overflow. 1.000000 Name: V34, dtype: float64
CPU times: total: 406 ms Wall time: 478 ms
In [18]:
%%time
#polars
res = dPolars.group_by('V42').agg(polars.col('V34').mean())
display(res.sort(by=polars.col("V34")))
shape: (23, 2)
V42 | V34 |
---|---|
str | f64 |
"portsweep." | 0.002928 |
"perl." | 0.013333 |
"satan." | 0.014215 |
"neptune." | 0.043371 |
"spy." | 0.185 |
… | … |
"phf." | 0.9725 |
"smurf." | 0.999691 |
"guess_passwd." | 1.0 |
"back." | 1.0 |
"buffer_overflow." | 1.0 |
CPU times: total: 4.2 s Wall time: 1.14 s
Moyennes conditionnelles (2 critères)¶
In [19]:
%%time
#pandas
display(pandas.pivot_table(dPandas,index='V2',columns='V42',values='V34',aggfunc='mean'))
V42 | back. | buffer_overflow. | ftp_write. | guess_passwd. | imap. | ipsweep. | land. | loadmodule. | multihop. | neptune. | ... | phf. | pod. | portsweep. | rootkit. | satan. | smurf. | spy. | teardrop. | warezclient. | warezmaster. |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
V2 | |||||||||||||||||||||
icmp | NaN | NaN | NaN | NaN | NaN | 0.999392 | NaN | NaN | NaN | NaN | ... | NaN | 0.659735 | 0.338333 | NaN | 0.209730 | 0.999691 | NaN | NaN | NaN | NaN |
tcp | 1.0 | 1.0 | 0.875 | 1.0 | 0.916667 | 0.068409 | 0.87 | 0.835556 | 0.715714 | 0.043371 | ... | 0.9725 | NaN | 0.002735 | 0.151429 | 0.000751 | NaN | 0.185 | NaN | 0.735441 | 0.9 |
udp | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | 0.666667 | 0.121499 | NaN | NaN | 0.246834 | NaN | NaN |
3 rows × 23 columns
CPU times: total: 1.16 s Wall time: 1.43 s
In [20]:
%%time
#polars
display(dPolars.pivot('V42',index='V2',values='V34',aggregate_function='mean',sort_columns=True))
shape: (3, 24)
V2 | back. | buffer_overflow. | ftp_write. | guess_passwd. | imap. | ipsweep. | land. | loadmodule. | multihop. | neptune. | nmap. | normal. | perl. | phf. | pod. | portsweep. | rootkit. | satan. | smurf. | spy. | teardrop. | warezclient. | warezmaster. |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
str | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
"tcp" | 1.0 | 1.0 | 0.875 | 1.0 | 0.916667 | 0.068409 | 0.87 | 0.835556 | 0.715714 | 0.043371 | 0.003559 | 0.904801 | 0.013333 | 0.9725 | null | 0.002735 | 0.151429 | 0.000751 | null | 0.185 | null | 0.735441 | 0.9 |
"udp" | null | null | null | null | null | null | null | null | null | null | 0.75344 | 0.625851 | null | null | null | null | 0.666667 | 0.121499 | null | null | 0.246834 | null | null |
"icmp" | null | null | null | null | null | 0.999392 | null | null | null | null | 0.996008 | 0.51977 | null | null | 0.659735 | 0.338333 | null | 0.20973 | 0.999691 | null | null | null | null |
CPU times: total: 1.53 s Wall time: 872 ms