Version¶
In [1]:
#pyspark
import pyspark
pyspark.__version__
Out[1]:
'3.5.4'
Loading and preparing the data¶
Reading and inspection¶
In [2]:
#change the current working directory
import os
os.chdir("C:/Users/ricco/Desktop/demo")
In [3]:
#create a session
from pyspark.sql import SparkSession
#configured locally ("local[*]" uses all available cores),
#with executor memory capped at 1 GB and 4 cores per executor
spark = SparkSession.builder \
    .master("local[*]") \
    .config("spark.executor.memory", "1g") \
    .config("spark.executor.cores", "4") \
    .getOrCreate()
#type of the object
print(type(spark))
<class 'pyspark.sql.session.SparkSession'>
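Note: the session picks up defaults for any option not set above. If needed, the settings actually in effect can be listed from the underlying SparkContext; a minimal sketch (the output depends on the local machine):
#sketch: inspect the effective configuration of the session
for key, value in spark.sparkContext.getConf().getAll():
    print(key, "=", value)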
In [4]:
#load the file - tab-separated columns
df = spark.read.options(delimiter="\t").csv("market_basket.txt",header=True,inferSchema=True)
#type of the object
print(type(df))
<class 'pyspark.sql.dataframe.DataFrame'>
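As a side note, inferSchema=True triggers an extra pass over the file to guess the column types. On large files, supplying an explicit schema avoids it; a minimal sketch, assuming the two columns Panier (integer) and Produit (string), with df_bis as an illustrative name:
#sketch: read with an explicit schema instead of inferSchema
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
schema = StructType([
    StructField("Panier", IntegerType(), True),
    StructField("Produit", StringType(), True)
])
df_bis = spark.read.options(delimiter="\t").schema(schema).csv("market_basket.txt", header=True)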
In [5]:
#dimensions
print(f"Rows: {df.count()}")
print(f"Columns: {len(df.columns)}")
Rows: 12935
Columns: 2
In [6]:
#structure of the table
df.printSchema()
root
 |-- Panier: integer (nullable = true)
 |-- Produit: string (nullable = true)
In [7]:
#first rows
df.show(15,truncate=False)
+------+------------------------+
|Panier|Produit                 |
+------+------------------------+
|1     |Peaches                 |
|2     |Vegetable_Oil           |
|2     |Frozen_Corn             |
|3     |Plums                   |
|4     |Pancake_Mix             |
|5     |Cheese                  |
|6     |Cauliflower             |
|7     |2pct_Milk               |
|8     |98pct_Fat_Free_Hamburger|
|8     |Potato_Chips            |
|8     |Sesame_Oil              |
|8     |Ice_Cream_Sandwich      |
|8     |Frozen_Cheese_Pizza     |
|8     |Frozen_Sausage_Pizza    |
|8     |Deli_Salad              |
+------+------------------------+
only showing top 15 rows
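Before aggregating into baskets, a quick sanity check on the size of the product catalogue can help; a sketch (the result is not reproduced here):
#sketch: number of distinct products in the file
df.select("Produit").distinct().count()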
Preparing the data for FP-Growth¶
In [8]:
#to get a list of lists (one list of products per basket)
from pyspark.sql.functions import collect_list
dfTrans = df.groupBy(df.Panier).agg(collect_list('Produit'))
#new type
type(dfTrans)
Out[8]:
pyspark.sql.dataframe.DataFrame
In [9]:
#display
dfTrans.show(10,truncate=False)
+------+------------------------------------------------------------+
|Panier|collect_list(Produit)                                       |
+------+------------------------------------------------------------+
|1     |[Peaches]                                                   |
|2     |[Vegetable_Oil, Frozen_Corn]                                |
|3     |[Plums]                                                     |
|4     |[Pancake_Mix]                                               |
|5     |[Cheese]                                                    |
|6     |[Cauliflower]                                               |
|7     |[2pct_Milk]                                                 |
|8     |[98pct_Fat_Free_Hamburger, Potato_Chips, Sesame_Oil, Ice_Cream_Sandwich, Frozen_Cheese_Pizza, Frozen_Sausage_Pizza, Deli_Salad, Brown_Sugar, Corn_Chips, Merlot_Wine, White_Bread, Diet_Cola, Peanut_Butter_Cookies, Pretzels, Lollipops, Baked_Beans, Salt]|
|9     |[Avocado_Dip]                                               |
|10    |[Toilet_Paper]                                              |
+------+------------------------------------------------------------+
only showing top 10 rows
In [10]:
#rename the products column
dfTrans = dfTrans.withColumnRenamed('collect_list(Produit)','Items')
dfTrans.show(5,truncate=False)
+------+----------------------------+
|Panier|Items                       |
+------+----------------------------+
|1     |[Peaches]                   |
|2     |[Vegetable_Oil, Frozen_Corn]|
|3     |[Plums]                     |
|4     |[Pancake_Mix]               |
|5     |[Cheese]                    |
+------+----------------------------+
only showing top 5 rows
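The aggregation and the renaming above can also be fused into a single step by naming the aggregate with alias(); an equivalent sketch, with dfTrans_bis as an illustrative name:
#sketch: aggregate and name the column at once
dfTrans_bis = df.groupBy("Panier").agg(collect_list("Produit").alias("Items"))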
In [11]:
#number of transactions (baskets)
dfTrans.count()
Out[11]:
1360
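FP-Growth scans the transactions several times during the fit, so caching them can pay off on larger data; a minimal sketch:
#sketch: keep the transactions in memory across the FP-Growth passes
dfTrans = dfTrans.cache()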
Extracting the frequent itemsets¶
In [12]:
#algorithm
from pyspark.ml.fpm import FPGrowth
#instantiation and parameter settings (frequent itemsets AND rules)
fp = FPGrowth(itemsCol="Items",minSupport=0.03,minConfidence=0.75)
fpm = fp.fit(dfTrans)
#type of the resulting object
type(fpm)
Out[12]:
pyspark.ml.fpm.FPGrowthModel
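To make minSupport concrete: with 1360 baskets, an itemset must appear in at least ceil(0.03 × 1360) = 41 of them to be retained, which matches the smallest freq values displayed below; a quick check:
#sketch: absolute frequency threshold implied by minSupport = 0.03
import math
print(math.ceil(0.03 * dfTrans.count()))  #41 with the 1360 baskets here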
In [13]:
#access the frequent itemsets
freq_itemsets = fpm.freqItemsets
#type of the object -> data frame
type(freq_itemsets)
Out[13]:
pyspark.sql.dataframe.DataFrame
In [14]:
#display the first 10 rows
freq_itemsets.show(10,truncate=False)
+-----------------------+----+
|items                  |freq|
+-----------------------+----+
|[Vanilla_Wafers]       |41  |
|[Lemons]               |41  |
|[Hot_Chocolate]        |46  |
|[Apple_Jelly]          |45  |
|[Creamy_Peanut_Butter] |47  |
|[75_Watt_Lightbulb]    |43  |
|[Cream_Soda]           |43  |
|[Frozen_Corn]          |42  |
|[Honey_Roasted_Peanuts]|45  |
|[Bananas]              |81  |
+-----------------------+----+
only showing top 10 rows
In [15]:
#to get the support in relative terms
#create a new column in the data frame
#absolute frequency / number of transactions (baskets)
freq_itemsets = freq_itemsets.withColumn("support",freq_itemsets.freq/dfTrans.count())
freq_itemsets.show(10,truncate=False)
+-----------------------+----+--------------------+
|items                  |freq|support             |
+-----------------------+----+--------------------+
|[Vanilla_Wafers]       |41  |0.030147058823529412|
|[Lemons]               |41  |0.030147058823529412|
|[Hot_Chocolate]        |46  |0.033823529411764704|
|[Apple_Jelly]          |45  |0.03308823529411765 |
|[Creamy_Peanut_Butter] |47  |0.03455882352941177 |
|[75_Watt_Lightbulb]    |43  |0.03161764705882353 |
|[Cream_Soda]           |43  |0.03161764705882353 |
|[Frozen_Corn]          |42  |0.030882352941176472|
|[Honey_Roasted_Peanuts]|45  |0.03308823529411765 |
|[Bananas]              |81  |0.05955882352941176 |
+-----------------------+----+--------------------+
only showing top 10 rows
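Note that the cell above calls dfTrans.count() inside the expression, which triggers a recount of the transactions; computing the basket count once and reusing it is slightly cleaner. A sketch, with n_paniers and fi as illustrative names:
#sketch: store the number of baskets once and reuse it
n_paniers = dfTrans.count()
fi = fpm.freqItemsets
fi = fi.withColumn("support", fi.freq / n_paniers)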
In [16]:
#sort by decreasing support
freq_itemsets.orderBy(freq_itemsets.support.desc()).show(10,truncate=False)
+--------------------------+----+-------------------+
|items                     |freq|support            |
+--------------------------+----+-------------------+
|[Eggs]                    |167 |0.12279411764705882|
|[White_Bread]             |162 |0.11911764705882352|
|[2pct_Milk]               |149 |0.10955882352941176|
|[Potato_Chips]            |133 |0.09779411764705882|
|[98pct_Fat_Free_Hamburger]|127 |0.09338235294117647|
|[Hot_Dogs]                |126 |0.09264705882352942|
|[Potatoes]                |118 |0.08676470588235294|
|[Sweet_Relish]            |116 |0.08529411764705883|
|[Onions]                  |109 |0.08014705882352942|
|[Toothpaste]              |108 |0.07941176470588235|
+--------------------------+----+-------------------+
only showing top 10 rows
In [17]:
#type of the elements in the items column
#we have an array[string]
freq_itemsets.printSchema()
root
 |-- items: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- freq: long (nullable = false)
 |-- support: double (nullable = true)
In [18]:
#alias for the functions module
import pyspark.sql.functions as F
#cardinality of the itemsets - first 10 rows
freq_itemsets.select(F.size(freq_itemsets.items)).show(10)
+-----------+
|size(items)|
+-----------+
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
+-----------+
only showing top 10 rows
In [19]:
#to count the number of frequent itemsets
#by cardinality
freq_itemsets.select(F.size(freq_itemsets.items)).groupBy('size(items)').count().show()
+-----------+-----+
|size(items)|count|
+-----------+-----+
|          1|  148|
|          3|   17|
|          2|  140|
+-----------+-----+
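The same breakdown can be written without relying on the generated column name 'size(items)', by aliasing the expression directly; a sketch (the label "cardinality" is arbitrary):
#sketch: group on an aliased size expression
freq_itemsets.groupBy(F.size("items").alias("cardinality")).count().show()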
In [20]:
#itemsets of cardinality > 1
#sorted by decreasing support
res = freq_itemsets.filter(F.size(F.col("items")) > 1)
res.orderBy(res.support.desc()).show(10,truncate=False)
+------------------------------------------+----+--------------------+
|items                                     |freq|support             |
+------------------------------------------+----+--------------------+
|[White_Bread, Eggs]                       |75  |0.05514705882352941 |
|[2pct_Milk, Eggs]                         |71  |0.05220588235294118 |
|[2pct_Milk, White_Bread]                  |70  |0.051470588235294115|
|[Potato_Chips, White_Bread]               |70  |0.051470588235294115|
|[98pct_Fat_Free_Hamburger, White_Bread]   |67  |0.04926470588235294 |
|[Hamburger_Buns, 98pct_Fat_Free_Hamburger]|66  |0.04852941176470588 |
|[Potato_Chips, Eggs]                      |66  |0.04852941176470588 |
|[Toothpaste, White_Bread]                 |65  |0.04779411764705882 |
|[Sweet_Relish, Hot_Dogs]                  |64  |0.047058823529411764|
|[98pct_Fat_Free_Hamburger, Eggs]          |62  |0.045588235294117645|
+------------------------------------------+----+--------------------+
only showing top 10 rows
In [21]:
#filtering
#list of the itemsets containing the product "Eggs"
#sorted by decreasing support
res = freq_itemsets.filter(F.array_contains(F.col("items"),"Eggs"))
res.orderBy(res.support.desc()).show(10,truncate=False)
+--------------------------------+----+--------------------+
|items                           |freq|support             |
+--------------------------------+----+--------------------+
|[Eggs]                          |167 |0.12279411764705882 |
|[White_Bread, Eggs]             |75  |0.05514705882352941 |
|[2pct_Milk, Eggs]               |71  |0.05220588235294118 |
|[Potato_Chips, Eggs]            |66  |0.04852941176470588 |
|[98pct_Fat_Free_Hamburger, Eggs]|62  |0.045588235294117645|
|[Potatoes, Eggs]                |62  |0.045588235294117645|
|[Sweet_Relish, Eggs]            |61  |0.04485294117647059 |
|[Toothpaste, Eggs]              |61  |0.04485294117647059 |
|[Hot_Dogs, Eggs]                |58  |0.04264705882352941 |
|[Cola, Eggs]                    |55  |0.04044117647058824 |
+--------------------------------+----+--------------------+
only showing top 10 rows
In [22]:
#list of the itemsets containing both "Eggs" and "White_Bread"
#sorted by decreasing support
res = freq_itemsets.filter(F.array_contains(F.col("items"),"Eggs") &
F.array_contains(F.col("items"),"White_Bread"))
res.orderBy(res.support.desc()).show(10,truncate=False)
+---------------------------------------------+----+--------------------+
|items                                        |freq|support             |
+---------------------------------------------+----+--------------------+
|[White_Bread, Eggs]                          |75  |0.05514705882352941 |
|[2pct_Milk, White_Bread, Eggs]               |49  |0.03602941176470588 |
|[Potato_Chips, White_Bread, Eggs]            |46  |0.033823529411764704|
|[Toothpaste, White_Bread, Eggs]              |44  |0.03235294117647059 |
|[Potatoes, White_Bread, Eggs]                |42  |0.030882352941176472|
|[Wheat_Bread, White_Bread, Eggs]             |41  |0.030147058823529412|
|[98pct_Fat_Free_Hamburger, White_Bread, Eggs]|41  |0.030147058823529412|
|[Cola, White_Bread, Eggs]                    |41  |0.030147058823529412|
+---------------------------------------------+----+--------------------+
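Once filtered down to a handful of rows, such a result can be brought back to the driver for reporting, e.g. as a pandas data frame; a sketch, assuming pandas is installed and only sensible for small results:
#sketch: collect the small filtered result on the driver
pdf = res.toPandas()
print(pdf)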
Extracting (deriving) the rules¶
In [23]:
#retrieve the rules
regles = fpm.associationRules
#type of the object
type(regles)
Out[23]:
pyspark.sql.dataframe.DataFrame
In [24]:
#structure
regles.printSchema()
root
 |-- antecedent: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- consequent: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- confidence: double (nullable = false)
 |-- lift: double (nullable = true)
 |-- support: double (nullable = false)
In [25]:
#number of rules
regles.count()
Out[25]:
5
In [26]:
#display the rules
regles.show(truncate=False)
+----------------------------+-------------+------------------+-----------------+--------------------+
|antecedent                  |consequent   |confidence        |lift             |support             |
+----------------------------+-------------+------------------+-----------------+--------------------+
|[Potato_Chips, 2pct_Milk]   |[Eggs]       |0.7540983606557377|6.141160302346127|0.033823529411764704|
|[Toothpaste, Potato_Chips]  |[White_Bread]|0.803921568627451 |6.748971193415638|0.030147058823529412|
|[Toothpaste, 2pct_Milk]     |[White_Bread]|0.7627118644067796|6.4030131826742  |0.03308823529411765 |
|[Wheat_Bread, Eggs]         |[2pct_Milk]  |0.7636363636363637|6.970103721781575|0.030882352941176472|
|[Hot_Dog_Buns, Sweet_Relish]|[Hot_Dogs]   |0.8367346938775511|9.031422092646583|0.030147058823529412|
+----------------------------+-------------+------------------+-----------------+--------------------+
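Since confidence = support(rule) / support(antecedent), the support of the antecedent can be recovered from the displayed columns; a sketch adding it as a derived column, with regles_bis as an illustrative name:
#sketch: derive the support of the antecedent from the rule measures
regles_bis = regles.withColumn("antecedentSupport", F.col("support") / F.col("confidence"))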
In [27]:
#sort the rules by decreasing lift
regles.orderBy(regles.lift.desc()).show(truncate=False)
+----------------------------+-------------+------------------+-----------------+--------------------+
|antecedent                  |consequent   |confidence        |lift             |support             |
+----------------------------+-------------+------------------+-----------------+--------------------+
|[Hot_Dog_Buns, Sweet_Relish]|[Hot_Dogs]   |0.8367346938775511|9.031422092646583|0.030147058823529412|
|[Wheat_Bread, Eggs]         |[2pct_Milk]  |0.7636363636363637|6.970103721781575|0.030882352941176472|
|[Toothpaste, Potato_Chips]  |[White_Bread]|0.803921568627451 |6.748971193415638|0.030147058823529412|
|[Toothpaste, 2pct_Milk]     |[White_Bread]|0.7627118644067796|6.4030131826742  |0.03308823529411765 |
|[Potato_Chips, 2pct_Milk]   |[Eggs]       |0.7540983606557377|6.141160302346127|0.033823529411764704|
+----------------------------+-------------+------------------+-----------------+--------------------+
In [28]:
#rules leading to "White_Bread"
regles.filter(F.array_contains(F.col("consequent"),"White_Bread")).show(truncate=False)
+--------------------------+-------------+------------------+-----------------+--------------------+
|antecedent                |consequent   |confidence        |lift             |support             |
+--------------------------+-------------+------------------+-----------------+--------------------+
|[Toothpaste, Potato_Chips]|[White_Bread]|0.803921568627451 |6.748971193415638|0.030147058823529412|
|[Toothpaste, 2pct_Milk]   |[White_Bread]|0.7627118644067796|6.4030131826742  |0.03308823529411765 |
+--------------------------+-------------+------------------+-----------------+--------------------+
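The fitted model can also score the baskets themselves: transform() adds a prediction column listing, for each basket, the consequents of the rules whose antecedent it contains and which are not already in the basket; a minimal sketch:
#sketch: per-basket suggestions derived from the rules
fpm.transform(dfTrans).show(5, truncate=False)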
Stopping the session¶
In [29]:
#stop
spark.stop()