Version¶

In [1]:
# PySpark version check — records which Spark release this notebook ran against.
import pyspark
pyspark.__version__
Out[1]:
'3.5.4'

Chargement et préparation des données¶

Lecture et inspection¶

In [2]:
# Change the current working directory to where the data file lives.
# NOTE(review): hardcoded absolute Windows path — this only works on the
# author's machine; prefer a configurable data directory when reusing.
import os
os.chdir("C:/Users/ricco/Desktop/demo")
In [3]:
# Create (or reuse) a Spark session.
from pyspark.sql import SparkSession
# master("local[*]") runs Spark locally on ALL available cores.
# NOTE(review): the executor memory/cores settings below are presumably
# ignored in local mode (driver and executor share one JVM) — confirm.
spark = SparkSession.builder \
    .master("local[*]") \
    .config("spark.executor.memory", "1g") \
    .config("spark.executor.cores", "4") \
    .getOrCreate()

# Type of the resulting object (SparkSession).
print(type(spark))
<class 'pyspark.sql.session.SparkSession'>
In [4]:
# Load the data file — tab-separated, with a header row, letting Spark
# infer the column types (inferSchema triggers an extra pass over the file).
df = spark.read.options(delimiter="\t").csv("market_basket.txt",header=True,inferSchema=True)

# Type of the resulting object (Spark DataFrame).
print(type(df))
<class 'pyspark.sql.dataframe.DataFrame'>
In [5]:
# Dimensions of the DataFrame: row count is a Spark action (full scan),
# column count is purely local metadata (no job launched).
n_rows = df.count()
n_cols = len(df.columns)
print("Lignes : {}".format(n_rows))
print("Colonnes : {}".format(n_cols))
Lignes : 12935
Colonnes : 2
In [6]:
# Structure of the table: column names and inferred types.
df.printSchema()
root
 |-- Panier: integer (nullable = true)
 |-- Produit: string (nullable = true)

In [7]:
# First 15 rows, without truncating wide values.
df.show(15,truncate=False)
+------+------------------------+
|Panier|Produit                 |
+------+------------------------+
|1     |Peaches                 |
|2     |Vegetable_Oil           |
|2     |Frozen_Corn             |
|3     |Plums                   |
|4     |Pancake_Mix             |
|5     |Cheese                  |
|6     |Cauliflower             |
|7     |2pct_Milk               |
|8     |98pct_Fat_Free_Hamburger|
|8     |Potato_Chips            |
|8     |Sesame_Oil              |
|8     |Ice_Cream_Sandwich      |
|8     |Frozen_Cheese_Pizza     |
|8     |Frozen_Sausage_Pizza    |
|8     |Deli_Salad              |
+------+------------------------+
only showing top 15 rows

Préparation des données pour FP-Growth¶

In [8]:
# Build one transaction per basket: group the rows by basket id and
# collect each basket's products into a single list.
from pyspark.sql.functions import collect_list

dfTrans = df.groupBy("Panier").agg(collect_list("Produit"))

# Still a Spark DataFrame (one row per basket).
type(dfTrans)
Out[8]:
pyspark.sql.dataframe.DataFrame
In [9]:
# Display the first 10 transactions.
dfTrans.show(10,truncate=False)
+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Panier|collect_list(Produit)                                                                                                                                                                                                                                       |
+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1     |[Peaches]                                                                                                                                                                                                                                                   |
|2     |[Vegetable_Oil, Frozen_Corn]                                                                                                                                                                                                                                |
|3     |[Plums]                                                                                                                                                                                                                                                     |
|4     |[Pancake_Mix]                                                                                                                                                                                                                                               |
|5     |[Cheese]                                                                                                                                                                                                                                                    |
|6     |[Cauliflower]                                                                                                                                                                                                                                               |
|7     |[2pct_Milk]                                                                                                                                                                                                                                                 |
|8     |[98pct_Fat_Free_Hamburger, Potato_Chips, Sesame_Oil, Ice_Cream_Sandwich, Frozen_Cheese_Pizza, Frozen_Sausage_Pizza, Deli_Salad, Brown_Sugar, Corn_Chips, Merlot_Wine, White_Bread, Diet_Cola, Peanut_Butter_Cookies, Pretzels, Lollipops, Baked_Beans, Salt]|
|9     |[Avocado_Dip]                                                                                                                                                                                                                                               |
|10    |[Toilet_Paper]                                                                                                                                                                                                                                              |
+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 10 rows

In [10]:
# Rename the auto-generated "collect_list(Produit)" column to "Items",
# the column name FPGrowth is pointed at later in the notebook.
dfTrans = dfTrans.withColumnRenamed('collect_list(Produit)','Items')
dfTrans.show(5,truncate=False)
+------+----------------------------+
|Panier|Items                       |
+------+----------------------------+
|1     |[Peaches]                   |
|2     |[Vegetable_Oil, Frozen_Corn]|
|3     |[Plums]                     |
|4     |[Pancake_Mix]               |
|5     |[Cheese]                    |
+------+----------------------------+
only showing top 5 rows

In [11]:
# Number of transactions (i.e. baskets).
dfTrans.count()
Out[11]:
1360

Extraction des itemsets fréquents¶

In [12]:
# FP-Growth algorithm (frequent itemsets AND association rules).
from pyspark.ml.fpm import FPGrowth

# Instantiate and fit: itemsCol is the list-of-products column;
# minSupport=0.03 filters the itemsets, minConfidence=0.75 filters the rules.
fp = FPGrowth(itemsCol="Items",minSupport=0.03,minConfidence=0.75)
fpm = fp.fit(dfTrans)

# Type of the fitted model (FPGrowthModel).
type(fpm)
Out[12]:
pyspark.ml.fpm.FPGrowthModel
In [13]:
# Frequent itemsets found by the model.
freq_itemsets = fpm.freqItemsets

# Type of the object -> Spark DataFrame.
type(freq_itemsets)
Out[13]:
pyspark.sql.dataframe.DataFrame
In [14]:
# Display the first 10 frequent itemsets (absolute frequencies).
freq_itemsets.show(10,truncate=False)
+-----------------------+----+
|items                  |freq|
+-----------------------+----+
|[Vanilla_Wafers]       |41  |
|[Lemons]               |41  |
|[Hot_Chocolate]        |46  |
|[Apple_Jelly]          |45  |
|[Creamy_Peanut_Butter] |47  |
|[75_Watt_Lightbulb]    |43  |
|[Cream_Soda]           |43  |
|[Frozen_Corn]          |42  |
|[Honey_Roasted_Peanuts]|45  |
|[Bananas]              |81  |
+-----------------------+----+
only showing top 10 rows

In [15]:
# Express the support in relative terms: add a new column equal to the
# absolute frequency divided by the number of transactions (baskets).
# The transaction count is computed once and reused.
n_trans = dfTrans.count()
freq_itemsets = freq_itemsets.withColumn("support", freq_itemsets["freq"] / n_trans)
freq_itemsets.show(10, truncate=False)
+-----------------------+----+--------------------+
|items                  |freq|support             |
+-----------------------+----+--------------------+
|[Vanilla_Wafers]       |41  |0.030147058823529412|
|[Lemons]               |41  |0.030147058823529412|
|[Hot_Chocolate]        |46  |0.033823529411764704|
|[Apple_Jelly]          |45  |0.03308823529411765 |
|[Creamy_Peanut_Butter] |47  |0.03455882352941177 |
|[75_Watt_Lightbulb]    |43  |0.03161764705882353 |
|[Cream_Soda]           |43  |0.03161764705882353 |
|[Frozen_Corn]          |42  |0.030882352941176472|
|[Honey_Roasted_Peanuts]|45  |0.03308823529411765 |
|[Bananas]              |81  |0.05955882352941176 |
+-----------------------+----+--------------------+
only showing top 10 rows

In [16]:
# Top-10 itemsets sorted by decreasing support.
freq_itemsets.sort(freq_itemsets["support"].desc()).show(10, truncate=False)
+--------------------------+----+-------------------+
|items                     |freq|support            |
+--------------------------+----+-------------------+
|[Eggs]                    |167 |0.12279411764705882|
|[White_Bread]             |162 |0.11911764705882352|
|[2pct_Milk]               |149 |0.10955882352941176|
|[Potato_Chips]            |133 |0.09779411764705882|
|[98pct_Fat_Free_Hamburger]|127 |0.09338235294117647|
|[Hot_Dogs]                |126 |0.09264705882352942|
|[Potatoes]                |118 |0.08676470588235294|
|[Sweet_Relish]            |116 |0.08529411764705883|
|[Onions]                  |109 |0.08014705882352942|
|[Toothpaste]              |108 |0.07941176470588235|
+--------------------------+----+-------------------+
only showing top 10 rows

In [17]:
# Element type of the "items" column: it is an array<string>.
freq_itemsets.printSchema()
root
 |-- items: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- freq: long (nullable = false)
 |-- support: double (nullable = true)

In [18]:
# Alias for the SQL functions module (reused by the cells below).
import pyspark.sql.functions as F

# Cardinality (number of items) of each itemset — first 10 rows.
freq_itemsets.select(F.size(F.col("items"))).show(10)
+-----------+
|size(items)|
+-----------+
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
+-----------+
only showing top 10 rows

In [19]:
# Count the number of frequent itemsets per cardinality.
# Note: 'size(items)' is the auto-generated column name produced by F.size().
freq_itemsets.select(F.size(freq_itemsets.items)).groupBy('size(items)').count().show()
+-----------+-----+
|size(items)|count|
+-----------+-----+
|          1|  148|
|          3|   17|
|          2|  140|
+-----------+-----+

In [20]:
# Itemsets of cardinality > 1, sorted by decreasing support.
res = freq_itemsets.where(F.size(F.col("items")) > 1)
res.sort(res["support"].desc()).show(10, truncate=False)
+------------------------------------------+----+--------------------+
|items                                     |freq|support             |
+------------------------------------------+----+--------------------+
|[White_Bread, Eggs]                       |75  |0.05514705882352941 |
|[2pct_Milk, Eggs]                         |71  |0.05220588235294118 |
|[2pct_Milk, White_Bread]                  |70  |0.051470588235294115|
|[Potato_Chips, White_Bread]               |70  |0.051470588235294115|
|[98pct_Fat_Free_Hamburger, White_Bread]   |67  |0.04926470588235294 |
|[Hamburger_Buns, 98pct_Fat_Free_Hamburger]|66  |0.04852941176470588 |
|[Potato_Chips, Eggs]                      |66  |0.04852941176470588 |
|[Toothpaste, White_Bread]                 |65  |0.04779411764705882 |
|[Sweet_Relish, Hot_Dogs]                  |64  |0.047058823529411764|
|[98pct_Fat_Free_Hamburger, Eggs]          |62  |0.045588235294117645|
+------------------------------------------+----+--------------------+
only showing top 10 rows

In [21]:
# Filtering: itemsets that contain the product "Eggs",
# sorted by decreasing support.
contains_eggs = F.array_contains(F.col("items"), "Eggs")
res = freq_itemsets.filter(contains_eggs)
res.sort(res["support"].desc()).show(10, truncate=False)
+--------------------------------+----+--------------------+
|items                           |freq|support             |
+--------------------------------+----+--------------------+
|[Eggs]                          |167 |0.12279411764705882 |
|[White_Bread, Eggs]             |75  |0.05514705882352941 |
|[2pct_Milk, Eggs]               |71  |0.05220588235294118 |
|[Potato_Chips, Eggs]            |66  |0.04852941176470588 |
|[98pct_Fat_Free_Hamburger, Eggs]|62  |0.045588235294117645|
|[Potatoes, Eggs]                |62  |0.045588235294117645|
|[Sweet_Relish, Eggs]            |61  |0.04485294117647059 |
|[Toothpaste, Eggs]              |61  |0.04485294117647059 |
|[Hot_Dogs, Eggs]                |58  |0.04264705882352941 |
|[Cola, Eggs]                    |55  |0.04044117647058824 |
+--------------------------------+----+--------------------+
only showing top 10 rows

In [22]:
# Itemsets containing both "Eggs" and "White_Bread",
# sorted by decreasing support.
has_eggs = F.array_contains(F.col("items"), "Eggs")
has_bread = F.array_contains(F.col("items"), "White_Bread")
res = freq_itemsets.filter(has_eggs & has_bread)
res.sort(res["support"].desc()).show(10, truncate=False)
+---------------------------------------------+----+--------------------+
|items                                        |freq|support             |
+---------------------------------------------+----+--------------------+
|[White_Bread, Eggs]                          |75  |0.05514705882352941 |
|[2pct_Milk, White_Bread, Eggs]               |49  |0.03602941176470588 |
|[Potato_Chips, White_Bread, Eggs]            |46  |0.033823529411764704|
|[Toothpaste, White_Bread, Eggs]              |44  |0.03235294117647059 |
|[Potatoes, White_Bread, Eggs]                |42  |0.030882352941176472|
|[Wheat_Bread, White_Bread, Eggs]             |41  |0.030147058823529412|
|[98pct_Fat_Free_Hamburger, White_Bread, Eggs]|41  |0.030147058823529412|
|[Cola, White_Bread, Eggs]                    |41  |0.030147058823529412|
+---------------------------------------------+----+--------------------+

Extraction (déduction) des règles¶

In [23]:
# Association rules derived from the frequent itemsets
# (filtered by the minConfidence set at fit time).
regles = fpm.associationRules

# Type of the object (Spark DataFrame).
type(regles)
Out[23]:
pyspark.sql.dataframe.DataFrame
In [24]:
# Structure: antecedent, consequent, confidence, lift, support.
regles.printSchema()
root
 |-- antecedent: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- consequent: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- confidence: double (nullable = false)
 |-- lift: double (nullable = true)
 |-- support: double (nullable = false)

In [25]:
# Number of rules retained.
regles.count()
Out[25]:
5
In [26]:
# Display all the rules.
regles.show(truncate=False)
+----------------------------+-------------+------------------+-----------------+--------------------+
|antecedent                  |consequent   |confidence        |lift             |support             |
+----------------------------+-------------+------------------+-----------------+--------------------+
|[Potato_Chips, 2pct_Milk]   |[Eggs]       |0.7540983606557377|6.141160302346127|0.033823529411764704|
|[Toothpaste, Potato_Chips]  |[White_Bread]|0.803921568627451 |6.748971193415638|0.030147058823529412|
|[Toothpaste, 2pct_Milk]     |[White_Bread]|0.7627118644067796|6.4030131826742  |0.03308823529411765 |
|[Wheat_Bread, Eggs]         |[2pct_Milk]  |0.7636363636363637|6.970103721781575|0.030882352941176472|
|[Hot_Dog_Buns, Sweet_Relish]|[Hot_Dogs]   |0.8367346938775511|9.031422092646583|0.030147058823529412|
+----------------------------+-------------+------------------+-----------------+--------------------+

In [27]:
# Rules sorted by decreasing lift.
regles.sort(regles["lift"].desc()).show(truncate=False)
+----------------------------+-------------+------------------+-----------------+--------------------+
|antecedent                  |consequent   |confidence        |lift             |support             |
+----------------------------+-------------+------------------+-----------------+--------------------+
|[Hot_Dog_Buns, Sweet_Relish]|[Hot_Dogs]   |0.8367346938775511|9.031422092646583|0.030147058823529412|
|[Wheat_Bread, Eggs]         |[2pct_Milk]  |0.7636363636363637|6.970103721781575|0.030882352941176472|
|[Toothpaste, Potato_Chips]  |[White_Bread]|0.803921568627451 |6.748971193415638|0.030147058823529412|
|[Toothpaste, 2pct_Milk]     |[White_Bread]|0.7627118644067796|6.4030131826742  |0.03308823529411765 |
|[Potato_Chips, 2pct_Milk]   |[Eggs]       |0.7540983606557377|6.141160302346127|0.033823529411764704|
+----------------------------+-------------+------------------+-----------------+--------------------+

In [28]:
# Rules whose consequent contains "White_Bread".
white_bread_rules = regles.filter(F.array_contains(F.col("consequent"), "White_Bread"))
white_bread_rules.show(truncate=False)
+--------------------------+-------------+------------------+-----------------+--------------------+
|antecedent                |consequent   |confidence        |lift             |support             |
+--------------------------+-------------+------------------+-----------------+--------------------+
|[Toothpaste, Potato_Chips]|[White_Bread]|0.803921568627451 |6.748971193415638|0.030147058823529412|
|[Toothpaste, 2pct_Milk]   |[White_Bread]|0.7627118644067796|6.4030131826742  |0.03308823529411765 |
+--------------------------+-------------+------------------+-----------------+--------------------+

Stopper la session¶

In [29]:
# Stop the Spark session and release its resources.
spark.stop()