Version

In [1]:
# Import mlxtend and display the version installed in the running kernel.
import mlxtend
mlxtend.__version__
Out[1]:
'0.23.4'

Importation et inspection des données

In [2]:
# Data directory: taken from the MARKET_BASKET_DIR environment variable when it
# is set; otherwise the original hard-coded folder is used, so the behaviour on
# the author's machine is unchanged.
import os
DATA_DIR = os.environ.get("MARKET_BASKET_DIR", "C:/Users/ricco/Desktop/demo")
os.chdir(DATA_DIR)

# Load the transactions workbook from the working directory, then summarise
# its columns (names, non-null counts, dtypes).
import pandas
D = pandas.read_excel("market_basket.xlsx")
D.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12935 entries, 0 to 12934
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Panier   12935 non-null  int64 
 1   Produit  12935 non-null  object
dtypes: int64(1), object(1)
memory usage: 202.2+ KB
In [3]:
# Number of product rows recorded for each basket id, largest baskets first
# (value_counts sorts by decreasing count by default).
freqPanier = D["Panier"].value_counts()
freqPanier
Out[3]:
Panier
650     303
767     120
154     120
810     113
492     109
       ... 
1358      1
1356      1
1355      1
1354      1
1353      1
Name: count, Length: 1360, dtype: int64
In [4]:
# Bar chart of the 10 fullest baskets.
import matplotlib.pyplot as plt
# The index of freqPanier holds integer basket ids, so the first 10 rows are
# taken explicitly by position with .iloc rather than with a bare [:10].
top_baskets = freqPanier.iloc[:10]
fig, ax = plt.subplots()
ax.set_title("Les 10 paniers les plus fournis")
ax.set_xlabel("Panier")
ax.set_ylabel("Nombre de produits")
# Basket ids are converted to str so they are drawn as categories rather than
# as numeric positions on the x axis. The bar call stays last so the cell
# still evaluates to the BarContainer.
ax.bar(x=[str(i) for i in top_baskets.index],height=top_baskets)
Out[4]:
<BarContainer object of 10 artists>
[Figure : diagramme en barres des 10 paniers les plus fournis]
In [5]:
# Number of rows in which each product appears, most frequent products first.
freqProd = D["Produit"].value_counts()
freqProd
Out[5]:
Produit
Eggs                        167
White_Bread                 162
2pct_Milk                   149
Potato_Chips                133
98pct_Fat_Free_Hamburger    127
                           ... 
Daily_Newspaper               8
Plain_English_Muffins         8
Brown_Sugar_Grits             8
Oats_and_Nuts_Cereal          7
Celery                        7
Name: count, Length: 303, dtype: int64
In [6]:
# Horizontal bar chart of the 10 most frequent products.
import numpy  # not used by this cell any more; kept in case other cells rely on the name
# Take the first 10 rows by position, then reverse them so that the most
# frequent product is drawn at the top (barh draws the first entry at the bottom).
top_products = freqProd.iloc[:10].iloc[::-1]
fig, ax = plt.subplots()
ax.set_title("Les 10 produits les plus fréquents")
ax.set_xlabel("Nombre de paniers")
ax.set_ylabel("Produit")
# The barh call stays last so the cell still evaluates to the BarContainer.
ax.barh(top_products.index,width=top_products)
Out[6]:
<BarContainer object of 10 artists>
[Figure : diagramme en barres horizontales des 10 produits les plus fréquents]

Itemsets fréquents - A priori

Préparation des données

In [7]:
# Data preparation: basket x product table of booleans.
# First count the occurrences of every (basket, product) pair, then turn the
# counts into booleans (True as soon as the count is non-zero).
basket_product_counts = pandas.crosstab(index=D["Panier"], columns=D["Produit"])
TB = basket_product_counts.astype(bool)
type(TB)
Out[7]:
pandas.core.frame.DataFrame
In [8]:
# Dimensions of the boolean table, as a (rows, columns) tuple.
TB.shape
Out[8]:
(1360, 303)
In [9]:
# A few values: the first 10 rows and first 10 columns, selected by position.
TB.iloc[:10,:10]
Out[9]:
Produit 100_Watt_Lightbulb 2pct_Milk 40_Watt_Lightbulb 60_Watt_Lightbulb 75_Watt_Lightbulb 98pct_Fat_Free_Hamburger AA_Cell_Batteries Apple_Cinnamon_Waffles Apple_Drink Apple_Fruit_Roll
Panier
1 False False False False False False False False False False
2 False False False False False False False False False False
3 False False False False False False False False False False
4 False False False False False False False False False False
5 False False False False False False False False False False
6 False False False False False False False False False False
7 False True False False False False False False False False
8 False False False False False True False False False False
9 False False False False False False False False False False
10 False False False False False False False False False False
In [10]:
# Sanity check on baskets: count the True cells across each row (one row per
# basket) and list the counts in decreasing order.
products_per_basket = TB.sum(axis="columns")
products_per_basket.sort_values(ascending=False)
Out[10]:
Panier
650     303
767     120
154     120
810     113
492     109
       ... 
1358      1
1356      1
1355      1
1354      1
1353      1
Length: 1360, dtype: int64
In [11]:
# Sanity check on products: count the True cells down each column (one column
# per product) and list the counts in decreasing order.
baskets_per_product = TB.sum(axis="index")
baskets_per_product.sort_values(ascending=False)
Out[11]:
Produit
Eggs                        167
White_Bread                 162
2pct_Milk                   149
Potato_Chips                133
98pct_Fat_Free_Hamburger    127
                           ... 
Daily_Newspaper               8
Brown_Sugar_Grits             8
Chicken_Legs                  8
Oats_and_Nuts_Cereal          7
Celery                        7
Length: 303, dtype: int64

Extraction des itemsets fréquents

In [12]:
# Bring in the Apriori implementation shipped with mlxtend.
from mlxtend.frequent_patterns import apriori
# Frequent itemsets mined from the boolean table: minimum support of 0.025,
# itemsets of at most 4 items, items reported by column name.
freq_itemsets = apriori(
    TB,
    min_support=0.025,
    max_len=4,
    use_colnames=True,
)
# The result is a pandas DataFrame.
type(freq_itemsets)
Out[12]:
pandas.core.frame.DataFrame
In [13]:
# Columns of the frequent-itemsets table: names, non-null counts and dtypes.
freq_itemsets.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 603 entries, 0 to 602
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   support   603 non-null    float64
 1   itemsets  603 non-null    object 
dtypes: float64(1), object(1)
memory usage: 9.6+ KB
In [14]:
# First 10 rows of the frequent-itemsets table.
freq_itemsets.head(10)
Out[14]:
support itemsets
0 0.030147 (100_Watt_Lightbulb)
1 0.109559 (2pct_Milk)
2 0.037500 (60_Watt_Lightbulb)
3 0.031618 (75_Watt_Lightbulb)
4 0.093382 (98pct_Fat_Free_Hamburger)
5 0.031618 (AA_Cell_Batteries)
6 0.025735 (Apple_Cinnamon_Waffles)
7 0.026471 (Apple_Drink)
8 0.031618 (Apple_Fruit_Roll)
9 0.032353 (Apple_Jam)

Post-traitement - Filtrage...

In [15]:
# Type of the "itemsets" column itself.
type(freq_itemsets.itemsets)
Out[15]:
pandas.core.series.Series
In [16]:
# Display the first itemset (row at position 0).
freq_itemsets.itemsets.iloc[0]
Out[16]:
frozenset({'100_Watt_Lightbulb'})
In [17]:
# Type of an individual value stored in the "itemsets" column.
type(freq_itemsets.itemsets.iloc[0])
Out[17]:
frozenset
In [18]:
# Cardinality (number of items) of the first itemset.
len(freq_itemsets.itemsets.iloc[0])
Out[18]:
1

... Selon le cardinal

In [19]:
# Distribution of itemset cardinalities: how many itemsets hold 1, 2, 3, ... items.
itemset_sizes = freq_itemsets.itemsets.map(len)
itemset_sizes.value_counts()
Out[19]:
itemsets
2    300
1    206
3     94
4      3
Name: count, dtype: int64
In [20]:
# Keep only the itemsets made of exactly 3 items.
has_three_items = freq_itemsets.itemsets.map(len).eq(3)
freq_itemsets.loc[has_three_items]
Out[20]:
support itemsets
506 0.027206 (2pct_Milk, 98pct_Fat_Free_Hamburger, Eggs)
507 0.027206 (2pct_Milk, Potato_Chips, 98pct_Fat_Free_Hambu...
508 0.026471 (2pct_Milk, White_Bread, 98pct_Fat_Free_Hambur...
509 0.025735 (2pct_Milk, Aspirin, Eggs)
510 0.025000 (2pct_Milk, Aspirin, Potato_Chips)
... ... ...
595 0.025735 (Toothpaste, White_Bread, Potatoes)
596 0.027941 (Toothpaste, White_Bread, Sweet_Relish)
597 0.025735 (Toothpaste, Toilet_Paper, White_Bread)
598 0.025735 (Toothpaste, White_Bread, Tomatoes)
599 0.026471 (Toothpaste, Wheat_Bread, White_Bread)

94 rows × 2 columns

... Selon l'apparition des items

In [21]:
# Itemset consisting of the single item "White_Bread": each itemset is
# compared for equality with the given set.
is_white_bread_only = freq_itemsets.itemsets.eq({'White_Bread'})
freq_itemsets.loc[is_white_bread_only]
Out[21]:
support itemsets
199 0.119118 (White_Bread)
In [22]:
# Itemsets that include the item "White_Bread": `ge` keeps every itemset that
# is a superset of the given set.
contains_white_bread = freq_itemsets.itemsets.ge({'White_Bread'})
freq_itemsets.loc[contains_white_bread]
Out[22]:
support itemsets
199 0.119118 (White_Bread)
248 0.051471 (2pct_Milk, White_Bread)
267 0.049265 (White_Bread, 98pct_Fat_Free_Hamburger)
271 0.029412 (Apples, White_Bread)
286 0.041912 (Aspirin, White_Bread)
... ... ...
598 0.025735 (Toothpaste, White_Bread, Tomatoes)
599 0.026471 (Toothpaste, Wheat_Bread, White_Bread)
600 0.026471 (2pct_Milk, Potato_Chips, White_Bread, Eggs)
601 0.025735 (2pct_Milk, Toothpaste, White_Bread, Eggs)
602 0.025000 (2pct_Milk, Toothpaste, Potato_Chips, White_Br...

103 rows × 2 columns

In [23]:
# Shape (rows, columns) of the selection of itemsets that include "White_Bread".
freq_itemsets.loc[freq_itemsets.itemsets.ge({'White_Bread'})].shape
Out[23]:
(103, 2)
In [24]:
# Itemsets that include both "White_Bread" and "Eggs".
required_items = {'White_Bread','Eggs'}
freq_itemsets.loc[freq_itemsets.itemsets.ge(required_items)]
Out[24]:
support itemsets
378 0.055147 (White_Bread, Eggs)
527 0.036029 (2pct_Milk, White_Bread, Eggs)
548 0.030147 (White_Bread, 98pct_Fat_Free_Hamburger, Eggs)
553 0.029412 (Aspirin, White_Bread, Eggs)
558 0.030147 (White_Bread, Eggs, Cola)
564 0.028676 (Hot_Dogs, White_Bread, Eggs)
567 0.027206 (Onions, White_Bread, Eggs)
568 0.025735 (White_Bread, Pepperoni_Pizza_-_Frozen, Eggs)
570 0.027206 (Popcorn_Salt, White_Bread, Eggs)
575 0.033824 (Potato_Chips, White_Bread, Eggs)
577 0.030882 (White_Bread, Eggs, Potatoes)
578 0.027206 (Sugar_Cookies, White_Bread, Eggs)
580 0.026471 (White_Bread, Sweet_Relish, Eggs)
581 0.025735 (Toilet_Paper, Eggs, White_Bread)
582 0.026471 (Tomatoes, White_Bread, Eggs)
583 0.032353 (Toothpaste, White_Bread, Eggs)
584 0.030147 (Wheat_Bread, Eggs, White_Bread)
600 0.026471 (2pct_Milk, Potato_Chips, White_Bread, Eggs)
601 0.025735 (2pct_Milk, Toothpaste, White_Bread, Eggs)
In [25]:
# The order in which the items are listed does not matter: same selection with
# "Eggs" written before "White_Bread".
required_items = {'Eggs','White_Bread'}
freq_itemsets.loc[freq_itemsets.itemsets.ge(required_items)]
Out[25]:
support itemsets
378 0.055147 (White_Bread, Eggs)
527 0.036029 (2pct_Milk, White_Bread, Eggs)
548 0.030147 (White_Bread, 98pct_Fat_Free_Hamburger, Eggs)
553 0.029412 (Aspirin, White_Bread, Eggs)
558 0.030147 (White_Bread, Eggs, Cola)
564 0.028676 (Hot_Dogs, White_Bread, Eggs)
567 0.027206 (Onions, White_Bread, Eggs)
568 0.025735 (White_Bread, Pepperoni_Pizza_-_Frozen, Eggs)
570 0.027206 (Popcorn_Salt, White_Bread, Eggs)
575 0.033824 (Potato_Chips, White_Bread, Eggs)
577 0.030882 (White_Bread, Eggs, Potatoes)
578 0.027206 (Sugar_Cookies, White_Bread, Eggs)
580 0.026471 (White_Bread, Sweet_Relish, Eggs)
581 0.025735 (Toilet_Paper, Eggs, White_Bread)
582 0.026471 (Tomatoes, White_Bread, Eggs)
583 0.032353 (Toothpaste, White_Bread, Eggs)
584 0.030147 (Wheat_Bread, Eggs, White_Bread)
600 0.026471 (2pct_Milk, Potato_Chips, White_Bread, Eggs)
601 0.025735 (2pct_Milk, Toothpaste, White_Bread, Eggs)

Extraction des règles

In [26]:
# Rule-generation function from mlxtend.
from mlxtend.frequent_patterns import association_rules

# Derive the association rules from the frequent itemsets, filtering on the
# confidence metric with a threshold of 0.75.
the_rules = association_rules(
    freq_itemsets,
    metric="confidence",
    min_threshold=0.75,
)

# Print the type of the returned object.
print(type(the_rules))
<class 'pandas.core.frame.DataFrame'>
In [27]:
# Columns of the rules table: names, non-null counts and dtypes.
the_rules.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   antecedents         50 non-null     object 
 1   consequents         50 non-null     object 
 2   antecedent support  50 non-null     float64
 3   consequent support  50 non-null     float64
 4   support             50 non-null     float64
 5   confidence          50 non-null     float64
 6   lift                50 non-null     float64
 7   representativity    50 non-null     float64
 8   leverage            50 non-null     float64
 9   conviction          50 non-null     float64
 10  zhangs_metric       50 non-null     float64
 11  jaccard             50 non-null     float64
 12  certainty           50 non-null     float64
 13  kulczynski          50 non-null     float64
dtypes: float64(12), object(2)
memory usage: 5.6+ KB
In [28]:
# Keep only a subset of the rule columns, then show the first rows.
kept_columns = ['antecedents','consequents','support','confidence','lift']
regles = the_rules[kept_columns]
regles.head()
Out[28]:
antecedents consequents support confidence lift
0 (2pct_Milk, Aspirin) (White_Bread) 0.027206 0.787234 6.608878
1 (2pct_Milk, Bananas) (White_Bread) 0.025735 0.813953 6.833190
2 (White_Bread, Bananas) (2pct_Milk) 0.025735 0.795455 7.260525
3 (Wheat_Bread, Cola) (2pct_Milk) 0.025735 0.795455 7.260525
4 (2pct_Milk, Popcorn_Salt) (Eggs) 0.027206 0.822222 6.695941
In [29]:
# Order the rules by decreasing lift and show the first 5.
rules_by_lift = regles.sort_values(by='lift',ascending=False)
rules_by_lift.head(5)
Out[29]:
antecedents consequents support confidence lift
48 (2pct_Milk, Potato_Chips, White_Bread) (Toothpaste) 0.025000 0.755556 9.514403
29 (Sweet_Relish, Hot_Dog_Buns) (Hot_Dogs) 0.030147 0.836735 9.031422
16 (Hamburger_Buns, White_Bread) (98pct_Fat_Free_Hamburger) 0.026471 0.765957 8.202379
47 (2pct_Milk, Toothpaste, White_Bread) (Potato_Chips) 0.025000 0.755556 7.725982
8 (Onions, Wheat_Bread) (2pct_Milk) 0.028676 0.829787 7.573897
In [30]:
# Rules whose consequent is exactly {"Eggs"}.
leads_to_eggs = regles.consequents.eq({'Eggs'})
regles.loc[leads_to_eggs]
Out[30]:
antecedents consequents support confidence lift
4 (2pct_Milk, Popcorn_Salt) (Eggs) 0.027206 0.822222 6.695941
5 (2pct_Milk, Potato_Chips) (Eggs) 0.033824 0.754098 6.141160
6 (2pct_Milk, Tomatoes) (Eggs) 0.025735 0.795455 6.477953
15 (98pct_Fat_Free_Hamburger, Potatoes) (Eggs) 0.025000 0.772727 6.292869
23 (Hot_Dogs, Potatoes) (Eggs) 0.025000 0.772727 6.292869
24 (White_Bread, Pepperoni_Pizza_-_Frozen) (Eggs) 0.025735 0.760870 6.196303
25 (Popcorn_Salt, Potatoes) (Eggs) 0.025000 0.809524 6.592529
26 (Popcorn_Salt, White_Bread) (Eggs) 0.027206 0.770833 6.277445
27 (Sweet_Relish, Potatoes) (Eggs) 0.028676 0.795918 6.481730
28 (Sugar_Cookies, White_Bread) (Eggs) 0.027206 0.770833 6.277445
40 (2pct_Milk, Potato_Chips, White_Bread) (Eggs) 0.026471 0.800000 6.514970
43 (2pct_Milk, Toothpaste, White_Bread) (Eggs) 0.025735 0.777778 6.333999
In [31]:
# Rules whose consequent is exactly {"Eggs"} and whose antecedent includes
# '2pct_Milk' (the antecedent must be a superset of the given set).
leads_to_eggs = regles.consequents.eq({'Eggs'})
milk_in_antecedent = regles.antecedents.ge({'2pct_Milk'})
regles.loc[leads_to_eggs & milk_in_antecedent]
Out[31]:
antecedents consequents support confidence lift
4 (2pct_Milk, Popcorn_Salt) (Eggs) 0.027206 0.822222 6.695941
5 (2pct_Milk, Potato_Chips) (Eggs) 0.033824 0.754098 6.141160
6 (2pct_Milk, Tomatoes) (Eggs) 0.025735 0.795455 6.477953
40 (2pct_Milk, Potato_Chips, White_Bread) (Eggs) 0.026471 0.800000 6.514970
43 (2pct_Milk, Toothpaste, White_Bread) (Eggs) 0.025735 0.777778 6.333999

Autres algos pour les itemsets fréquents

FPGROWTH

In [32]:
# Same input as before: the boolean basket x product table.
# Bring in the fpgrowth function from mlxtend.
from mlxtend.frequent_patterns import fpgrowth
# Frequent itemsets, with the same settings as the apriori call
# (min_support=0.025, max_len=4, items reported by column name).
freq_growth = fpgrowth(
    TB,
    min_support=0.025,
    max_len=4,
    use_colnames=True,
)
# The result is a pandas DataFrame; display its dimensions.
freq_growth.shape
Out[32]:
(603, 2)
In [33]:
# First 10 itemsets returned by fpgrowth.
freq_growth.head(10)
Out[33]:
support itemsets
0 0.033824 (Peaches)
1 0.040441 (Vegetable_Oil)
2 0.030882 (Frozen_Corn)
3 0.055882 (Plums)
4 0.057353 (Pancake_Mix)
5 0.029412 (Cheese)
6 0.030882 (Cauliflower)
7 0.109559 (2pct_Milk)
8 0.119118 (White_Bread)
9 0.097794 (Potato_Chips)
In [34]:
# Itemsets that include both White_Bread and Eggs.
# The pair (White_Bread, Eggs) on its own is part of the selection.
required_items = {'White_Bread','Eggs'}
freq_growth.loc[freq_growth.itemsets.ge(required_items)]
Out[34]:
support itemsets
219 0.036029 (2pct_Milk, White_Bread, Eggs)
220 0.055147 (White_Bread, Eggs)
224 0.033824 (Potato_Chips, White_Bread, Eggs)
227 0.026471 (2pct_Milk, Potato_Chips, White_Bread, Eggs)
234 0.030147 (White_Bread, 98pct_Fat_Free_Hamburger, Eggs)
257 0.025735 (Toilet_Paper, Eggs, White_Bread)
271 0.027206 (Onions, White_Bread, Eggs)
296 0.027206 (Sugar_Cookies, White_Bread, Eggs)
305 0.028676 (Hot_Dogs, White_Bread, Eggs)
330 0.027206 (Popcorn_Salt, White_Bread, Eggs)
360 0.029412 (Aspirin, White_Bread, Eggs)
387 0.030882 (White_Bread, Eggs, Potatoes)
403 0.026471 (White_Bread, Sweet_Relish, Eggs)
418 0.032353 (Toothpaste, White_Bread, Eggs)
429 0.025735 (Toothpaste, 2pct_Milk, White_Bread, Eggs)
444 0.026471 (Tomatoes, White_Bread, Eggs)
476 0.030147 (White_Bread, Eggs, Cola)
492 0.025735 (White_Bread, Pepperoni_Pizza_-_Frozen, Eggs)
533 0.030147 (Wheat_Bread, Eggs, White_Bread)

FPMAX

In [35]:
# Same input as before: the boolean basket x product table.
# Bring in the fpmax function from mlxtend.
from mlxtend.frequent_patterns import fpmax
# Itemsets returned by fpmax, with the same settings as the previous
# extractions (min_support=0.025, max_len=4, items reported by column name).
freq_max = fpmax(
    TB,
    min_support=0.025,
    max_len=4,
    use_colnames=True,
)
# The result is a pandas DataFrame; display its dimensions.
freq_max.shape
Out[35]:
(462, 2)
In [36]:
# First 10 itemsets returned by fpmax.
freq_max.head(10)
Out[36]:
support itemsets
0 0.025 (Buttermilk)
1 0.025 (Mouthwash)
2 0.025 (Ice_Cream_Sandwich)
3 0.025 (Imported_Beer)
4 0.025 (Mushroom_Pizza_-_Frozen)
5 0.025 (Merlot_Wine)
6 0.025 (Mixed_Nuts)
7 0.025 (Tangerines)
8 0.025 (Turkey_Noodle_Soup)
9 0.025 (White_Wine)
In [37]:
# Itemsets that include both White_Bread and Eggs.
# The pair (White_Bread, Eggs) on its own is no longer part of the selection.
required_items = {'White_Bread','Eggs'}
freq_max.loc[freq_max.itemsets.ge(required_items)]
Out[37]:
support itemsets
213 0.027206 (Sugar_Cookies, White_Bread, Eggs)
304 0.027206 (Popcorn_Salt, White_Bread, Eggs)
327 0.026471 (Tomatoes, White_Bread, Eggs)
344 0.029412 (Aspirin, White_Bread, Eggs)
359 0.025735 (White_Bread, Pepperoni_Pizza_-_Frozen, Eggs)
384 0.025735 (Toilet_Paper, Eggs, White_Bread)
398 0.030147 (Wheat_Bread, Eggs, White_Bread)
411 0.030147 (White_Bread, Eggs, Cola)
423 0.025735 (2pct_Milk, Toothpaste, White_Bread, Eggs)
431 0.027206 (Onions, White_Bread, Eggs)
438 0.026471 (White_Bread, Sweet_Relish, Eggs)
449 0.030882 (White_Bread, Eggs, Potatoes)
454 0.028676 (Hot_Dogs, White_Bread, Eggs)
460 0.030147 (White_Bread, 98pct_Fat_Free_Hamburger, Eggs)
461 0.026471 (2pct_Milk, Potato_Chips, White_Bread, Eggs)