Versions¶

In [1]:
#pandas
import pandas
pandas.__version__
Out[1]:
'2.2.3'
In [2]:
#polars
import polars
polars.__version__
Out[2]:
'1.19.0'

Chargement¶

In [3]:
#changer le répertoire courant
import os
os.chdir("C:/Users/ricco/Desktop/demo")
In [4]:
%%time

#importer les données avec Pandas
import pandas
dPandas = pandas.read_csv("kddcup99twice.txt")
display(dPandas.head())
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 ... V33 V34 V35 V36 V37 V38 V39 V40 V41 V42
0 0 tcp http SF 215 45076 0 0 0 0 ... 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 normal.
1 0 tcp http SF 215 45076 0 0 0 0 ... 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 normal.
2 0 tcp http SF 162 4528 0 0 0 0 ... 1 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 normal.
3 0 tcp http SF 162 4528 0 0 0 0 ... 1 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 normal.
4 0 tcp http SF 236 1228 0 0 0 0 ... 2 1.0 0.0 0.5 0.0 0.0 0.0 0.0 0.0 normal.

5 rows × 42 columns

CPU times: total: 19.8 s
Wall time: 21.2 s
In [5]:
%%time
#avec polars
dPolars = polars.read_csv("kddcup99twice.txt")
display(dPolars.head())
shape: (5, 42)
V1V2V3V4V5V6V7V8V9V10V11V12V13V14V15V16V17V18V19V20V21V22V23V24V25V26V27V28V29V30V31V32V33V34V35V36V37V38V39V40V41V42
i64strstrstri64i64i64i64i64i64i64i64i64i64i64i64i64i64i64i64i64i64i64i64f64f64f64f64f64f64f64i64i64f64f64f64f64f64f64f64f64str
0"tcp""http""SF"215450760000010000000000110.00.00.00.01.00.00.0000.00.00.00.00.00.00.00.0"normal."
0"tcp""http""SF"215450760000010000000000110.00.00.00.01.00.00.0000.00.00.00.00.00.00.00.0"normal."
0"tcp""http""SF"16245280000010000000000220.00.00.00.01.00.00.0111.00.01.00.00.00.00.00.0"normal."
0"tcp""http""SF"16245280000010000000000220.00.00.00.01.00.00.0111.00.01.00.00.00.00.00.0"normal."
0"tcp""http""SF"23612280000010000000000110.00.00.00.01.00.00.0221.00.00.50.00.00.00.00.0"normal."
CPU times: total: 10.5 s
Wall time: 1.96 s
In [6]:
#types des objets
print(type(dPandas))
print(type(dPolars))
<class 'pandas.core.frame.DataFrame'>
<class 'polars.dataframe.frame.DataFrame'>

Filtrage¶

In [7]:
%%time
#requête Pandas - 382696 obs.
res = dPandas.loc[(dPandas.V42=="normal.") & (dPandas.V2=="udp")].shape[0]
display(res)
382696
CPU times: total: 859 ms
Wall time: 976 ms
In [8]:
%%time
#requête Polars - 382696 obs.
res = dPolars.filter((polars.col("V42")=="normal.") & (polars.col("V2")=="udp")).shape[0]
display(res)
382696
CPU times: total: 141 ms
Wall time: 58.4 ms

Moyennes pour variables numériques¶

In [9]:
%%time
#moyennes avec Pandas
res = dPandas.mean(axis=0,numeric_only=True)
display(res.sum())
np.float64(4032.3814143100103)
CPU times: total: 609 ms
Wall time: 645 ms
In [10]:
%%time
# Sélectionner uniquement les colonnes numériques
numeric_columns = dPolars.select(polars.col(polars.Float64, polars.Int64))
# Calculer les moyennes pour toutes les colonnes numériques
mean_df = numeric_columns.select(
    [polars.col(col).mean().alias(f"mean_{col}") for col in numeric_columns.columns]
)
print(mean_df.sum_horizontal())
shape: (1,)
Series: 'sum' [f64]
[
	4032.381414
]
CPU times: total: 656 ms
Wall time: 109 ms

Filtrage + moyennes¶

In [11]:
%%time
#que les variables quanti toujours
res = dPandas.loc[dPandas.V42=="normal."].mean(axis=0,numeric_only=True)
display(res.sum())
np.float64(5303.223380298341)
CPU times: total: 766 ms
Wall time: 826 ms
In [12]:
%%time
#enchaînement filtrer les individus
#et sélectionner uniquement les colonnes numériques
fn_columns = (dPolars.filter(polars.col("V42")=="normal.")
    .select(polars.col(polars.Float64, polars.Int64)))
# Calculer les moyennes pour toutes les colonnes numériques
mean_df = fn_columns.select(
    [polars.col(col).mean().alias(f"mean_{col}") for col in numeric_columns.columns]
)
print(mean_df.sum_horizontal())
shape: (1,)
Series: 'sum' [f64]
[
	5303.22338
]
CPU times: total: 500 ms
Wall time: 97.4 ms

Tri selon une variable¶

In [13]:
%%time
#tri selon Pandas
res = dPandas.sort_values(by='V6',ascending=False)
display(res.head(1))
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 ... V33 V34 V35 V36 V37 V38 V39 V40 V41 V42
2835143 10999 tcp other RSTR 0 1309937401 0 0 0 0 ... 1 0.0 0.65 1.0 0.0 0.0 0.0 1.0 1.0 portsweep.

1 rows × 42 columns

CPU times: total: 2.23 s
Wall time: 2.32 s
In [14]:
%%time
#tri selon Polars
res = dPolars.sort(by=polars.col('V6'),descending=True)
display(res.head(1))
shape: (1, 42)
V1V2V3V4V5V6V7V8V9V10V11V12V13V14V15V16V17V18V19V20V21V22V23V24V25V26V27V28V29V30V31V32V33V34V35V36V37V38V39V40V41V42
i64strstrstri64i64i64i64i64i64i64i64i64i64i64i64i64i64i64i64i64i64i64i64f64f64f64f64f64f64f64i64i64f64f64f64f64f64f64f64f64str
10999"tcp""other""RSTR"013099374010000000000000000110.00.01.01.01.00.00.025510.00.651.00.00.00.01.01.0"portsweep."
CPU times: total: 22.6 s
Wall time: 5.21 s

Comptage de valeurs¶

In [15]:
%%time
#pandas
display(dPandas.V42.value_counts().sort_values(ascending=False))
V42
smurf.              5615772
neptune.            2144034
normal.             1945562
satan.                31784
ipsweep.              24962
portsweep.            20826
nmap.                  4632
back.                  4406
warezclient.           2040
teardrop.              1958
pod.                    528
guess_passwd.           106
buffer_overflow.         60
land.                    42
warezmaster.             40
imap.                    24
rootkit.                 20
loadmodule.              18
ftp_write.               16
multihop.                14
phf.                      8
perl.                     6
spy.                      4
Name: count, dtype: int64
CPU times: total: 328 ms
Wall time: 509 ms
In [16]:
%%time
#polar
display(dPolars['V42'].value_counts().sort('count',descending=True))
shape: (23, 2)
V42count
stru32
"smurf."5615772
"neptune."2144034
"normal."1945562
"satan."31784
"ipsweep."24962
……
"ftp_write."16
"multihop."14
"phf."8
"perl."6
"spy."4
CPU times: total: 219 ms
Wall time: 292 ms

Moyennes conditionnelles (1 critère)¶

In [17]:
%%time
#pandas
display(dPandas.groupby('V42')['V34'].mean().sort_values())
V42
portsweep.          0.002928
perl.               0.013333
satan.              0.014215
neptune.            0.043371
spy.                0.185000
teardrop.           0.246834
rootkit.            0.306000
nmap.               0.526736
pod.                0.659735
multihop.           0.715714
warezclient.        0.735441
loadmodule.         0.835556
normal.             0.844879
land.               0.870000
ftp_write.          0.875000
warezmaster.        0.900000
imap.               0.916667
ipsweep.            0.930469
phf.                0.972500
smurf.              0.999691
back.               1.000000
guess_passwd.       1.000000
buffer_overflow.    1.000000
Name: V34, dtype: float64
CPU times: total: 406 ms
Wall time: 478 ms
In [18]:
%%time
#polars
res = dPolars.group_by('V42').agg(polars.col('V34').mean())
display(res.sort(by=polars.col("V34")))
shape: (23, 2)
V42V34
strf64
"portsweep."0.002928
"perl."0.013333
"satan."0.014215
"neptune."0.043371
"spy."0.185
……
"phf."0.9725
"smurf."0.999691
"guess_passwd."1.0
"back."1.0
"buffer_overflow."1.0
CPU times: total: 4.2 s
Wall time: 1.14 s

Moyennes conditionnelles (2 critères)¶

In [19]:
%%time
#pandas
display(pandas.pivot_table(dPandas,index='V2',columns='V42',values='V34',aggfunc='mean'))
V42 back. buffer_overflow. ftp_write. guess_passwd. imap. ipsweep. land. loadmodule. multihop. neptune. ... phf. pod. portsweep. rootkit. satan. smurf. spy. teardrop. warezclient. warezmaster.
V2
icmp NaN NaN NaN NaN NaN 0.999392 NaN NaN NaN NaN ... NaN 0.659735 0.338333 NaN 0.209730 0.999691 NaN NaN NaN NaN
tcp 1.0 1.0 0.875 1.0 0.916667 0.068409 0.87 0.835556 0.715714 0.043371 ... 0.9725 NaN 0.002735 0.151429 0.000751 NaN 0.185 NaN 0.735441 0.9
udp NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 0.666667 0.121499 NaN NaN 0.246834 NaN NaN

3 rows × 23 columns

CPU times: total: 1.16 s
Wall time: 1.43 s
In [20]:
%%time
#polars
display(dPolars.pivot('V42',index='V2',values='V34',aggregate_function='mean',sort_columns=True))
shape: (3, 24)
V2back.buffer_overflow.ftp_write.guess_passwd.imap.ipsweep.land.loadmodule.multihop.neptune.nmap.normal.perl.phf.pod.portsweep.rootkit.satan.smurf.spy.teardrop.warezclient.warezmaster.
strf64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64
"tcp"1.01.00.8751.00.9166670.0684090.870.8355560.7157140.0433710.0035590.9048010.0133330.9725null0.0027350.1514290.000751null0.185null0.7354410.9
"udp"nullnullnullnullnullnullnullnullnullnull0.753440.625851nullnullnullnull0.6666670.121499nullnull0.246834nullnull
"icmp"nullnullnullnullnull0.999392nullnullnullnull0.9960080.51977nullnull0.6597350.338333null0.209730.999691nullnullnullnull
CPU times: total: 1.53 s
Wall time: 872 ms