Version¶
In [1]:
# Import mlxtend and display the installed version, so the environment that
# produced the outputs of this notebook is recorded.
import mlxtend
mlxtend.__version__
Out[1]:
'0.23.4'
Importation et inspection des données¶
In [2]:
# Working directory: taken from the MARKET_BASKET_DIR environment variable when
# it is set; otherwise the original author's local folder is used, so the
# behaviour is unchanged on that machine while staying runnable elsewhere.
import os
os.chdir(os.environ.get("MARKET_BASKET_DIR", "C:/Users/ricco/Desktop/demo"))
# Load the transactions: one row per (basket, product) pair, in the columns
# "Panier" (basket identifier) and "Produit" (product name).
import pandas
D = pandas.read_excel("market_basket.xlsx")
# Column types and non-null counts
D.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12935 entries, 0 to 12934 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Panier 12935 non-null int64 1 Produit 12935 non-null object dtypes: int64(1), object(1) memory usage: 202.2+ KB
In [3]:
# Number of rows (products) recorded for each basket, biggest baskets first
freqPanier = D["Panier"].value_counts(sort=True)
freqPanier
Out[3]:
Panier 650 303 767 120 154 120 810 113 492 109 ... 1358 1 1356 1 1355 1 1354 1 1353 1 Name: count, Length: 1360, dtype: int64
In [4]:
# Bar chart of the 10 largest baskets; basket ids are converted to strings so
# that matplotlib treats them as categories rather than numeric positions.
import matplotlib.pyplot as plt
plt.bar(x=list(map(str, freqPanier.index[:10])), height=freqPanier.iloc[:10])
Out[4]:
<BarContainer object of 10 artists>
In [5]:
# Number of occurrences of each product, most frequent first
freqProd = D["Produit"].value_counts()
freqProd
Out[5]:
Produit Eggs 167 White_Bread 162 2pct_Milk 149 Potato_Chips 133 98pct_Fat_Free_Hamburger 127 ... Daily_Newspaper 8 Plain_English_Muffins 8 Brown_Sugar_Grits 8 Oats_and_Nuts_Cereal 7 Celery 7 Name: count, Length: 303, dtype: int64
In [6]:
# Horizontal bar chart of the 10 most frequent products; both the labels and
# the counts are flipped so that the most frequent product ends up on top.
import numpy
plt.barh(numpy.flip(freqProd.index[:10]), width=numpy.flip(freqProd.iloc[:10]))
Out[6]:
<BarContainer object of 10 artists>
Itemsets fréquents - A priori¶
Préparation des données¶
In [7]:
# Data preparation: basket x product contingency table, converted to booleans
# (True when the product appears in the basket).
TB = pandas.crosstab(index=D["Panier"], columns=D["Produit"]).astype(bool)
type(TB)
Out[7]:
pandas.core.frame.DataFrame
In [8]:
# Dimensions of the boolean table: (number of baskets, number of products)
TB.shape
Out[8]:
(1360, 303)
In [9]:
# Preview: first 10 baskets x first 10 products
TB.iloc[:10,:10]
Out[9]:
Produit | 100_Watt_Lightbulb | 2pct_Milk | 40_Watt_Lightbulb | 60_Watt_Lightbulb | 75_Watt_Lightbulb | 98pct_Fat_Free_Hamburger | AA_Cell_Batteries | Apple_Cinnamon_Waffles | Apple_Drink | Apple_Fruit_Roll |
---|---|---|---|---|---|---|---|---|---|---|
Panier | ||||||||||
1 | False | False | False | False | False | False | False | False | False | False |
2 | False | False | False | False | False | False | False | False | False | False |
3 | False | False | False | False | False | False | False | False | False | False |
4 | False | False | False | False | False | False | False | False | False | False |
5 | False | False | False | False | False | False | False | False | False | False |
6 | False | False | False | False | False | False | False | False | False | False |
7 | False | True | False | False | False | False | False | False | False | False |
8 | False | False | False | False | False | True | False | False | False | False |
9 | False | False | False | False | False | False | False | False | False | False |
10 | False | False | False | False | False | False | False | False | False | False |
In [10]:
# Sanity check on baskets: number of True cells per row (products per basket),
# sorted in decreasing order
TB.sum(axis=1).sort_values(ascending=False)
Out[10]:
Panier 650 303 767 120 154 120 810 113 492 109 ... 1358 1 1356 1 1355 1 1354 1 1353 1 Length: 1360, dtype: int64
In [11]:
# Sanity check on products: number of True cells per column (baskets containing
# each product), sorted in decreasing order
TB.sum(axis=0).sort_values(ascending=False)
Out[11]:
Produit Eggs 167 White_Bread 162 2pct_Milk 149 Potato_Chips 133 98pct_Fat_Free_Hamburger 127 ... Daily_Newspaper 8 Brown_Sugar_Grits 8 Chicken_Legs 8 Oats_and_Nuts_Cereal 7 Celery 7 Length: 303, dtype: int64
Extraction des itemsets fréquents¶
In [12]:
# Import the apriori function
from mlxtend.frequent_patterns import apriori
# Frequent itemsets: minimum support of 2.5%, at most 4 items per itemset,
# items reported by product name (use_colnames=True)
freq_itemsets = apriori(TB,min_support=0.025,max_len=4,use_colnames=True)
# Type of the result -> pandas DataFrame
type(freq_itemsets)
Out[12]:
pandas.core.frame.DataFrame
In [13]:
# Columns of the result (support, itemsets) and their types
freq_itemsets.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 603 entries, 0 to 602 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 support 603 non-null float64 1 itemsets 603 non-null object dtypes: float64(1), object(1) memory usage: 9.6+ KB
In [14]:
# First 10 rows
freq_itemsets.head(10)
Out[14]:
support | itemsets | |
---|---|---|
0 | 0.030147 | (100_Watt_Lightbulb) |
1 | 0.109559 | (2pct_Milk) |
2 | 0.037500 | (60_Watt_Lightbulb) |
3 | 0.031618 | (75_Watt_Lightbulb) |
4 | 0.093382 | (98pct_Fat_Free_Hamburger) |
5 | 0.031618 | (AA_Cell_Batteries) |
6 | 0.025735 | (Apple_Cinnamon_Waffles) |
7 | 0.026471 | (Apple_Drink) |
8 | 0.031618 | (Apple_Fruit_Roll) |
9 | 0.032353 | (Apple_Jam) |
Post-traitement - Filtrage...¶
In [15]:
# Type of the `itemsets` column
type(freq_itemsets.itemsets)
Out[15]:
pandas.core.series.Series
In [16]:
# Display the first itemset
freq_itemsets.itemsets.iloc[0]
Out[16]:
frozenset({'100_Watt_Lightbulb'})
In [17]:
# Type of the values stored in the `itemsets` column
type(freq_itemsets.itemsets.iloc[0])
Out[17]:
frozenset
In [18]:
# Cardinality (number of items) of one itemset
len(freq_itemsets.itemsets.iloc[0])
Out[18]:
1
... Selon le cardinal
In [19]:
# How many itemsets there are for each cardinality (number of items)
freq_itemsets["itemsets"].apply(len).value_counts()
Out[19]:
itemsets 2 300 1 206 3 94 4 3 Name: count, dtype: int64
In [20]:
# Keep only the itemsets made of exactly 3 items
freq_itemsets[freq_itemsets["itemsets"].apply(len) == 3]
Out[20]:
support | itemsets | |
---|---|---|
506 | 0.027206 | (2pct_Milk, 98pct_Fat_Free_Hamburger, Eggs) |
507 | 0.027206 | (2pct_Milk, Potato_Chips, 98pct_Fat_Free_Hambu... |
508 | 0.026471 | (2pct_Milk, White_Bread, 98pct_Fat_Free_Hambur... |
509 | 0.025735 | (2pct_Milk, Aspirin, Eggs) |
510 | 0.025000 | (2pct_Milk, Aspirin, Potato_Chips) |
... | ... | ... |
595 | 0.025735 | (Toothpaste, White_Bread, Potatoes) |
596 | 0.027941 | (Toothpaste, White_Bread, Sweet_Relish) |
597 | 0.025735 | (Toothpaste, Toilet_Paper, White_Bread) |
598 | 0.025735 | (Toothpaste, White_Bread, Tomatoes) |
599 | 0.026471 | (Toothpaste, Wheat_Bread, White_Bread) |
94 rows × 2 columns
... Selon l'apparition des items
In [21]:
# Itemset made of the single item "White_Bread": each stored frozenset is
# compared for equality with the given set.
freq_itemsets[freq_itemsets["itemsets"].eq({'White_Bread'})]
Out[21]:
support | itemsets | |
---|---|---|
199 | 0.119118 | (White_Bread) |
In [22]:
# Itemsets that include "White_Bread": `ge` evaluates `frozenset >= set` on
# each row, which is a superset test.
freq_itemsets[freq_itemsets["itemsets"].ge({'White_Bread'})]
Out[22]:
support | itemsets | |
---|---|---|
199 | 0.119118 | (White_Bread) |
248 | 0.051471 | (2pct_Milk, White_Bread) |
267 | 0.049265 | (White_Bread, 98pct_Fat_Free_Hamburger) |
271 | 0.029412 | (Apples, White_Bread) |
286 | 0.041912 | (Aspirin, White_Bread) |
... | ... | ... |
598 | 0.025735 | (Toothpaste, White_Bread, Tomatoes) |
599 | 0.026471 | (Toothpaste, Wheat_Bread, White_Bread) |
600 | 0.026471 | (2pct_Milk, Potato_Chips, White_Bread, Eggs) |
601 | 0.025735 | (2pct_Milk, Toothpaste, White_Bread, Eggs) |
602 | 0.025000 | (2pct_Milk, Toothpaste, Potato_Chips, White_Br... |
103 rows × 2 columns
In [23]:
# That is, as (rows, columns): the number of itemsets including "White_Bread"
freq_itemsets.loc[freq_itemsets.itemsets.ge({'White_Bread'})].shape
Out[23]:
(103, 2)
In [24]:
# Itemsets including both "White_Bread" and "Eggs" (superset test against the
# two-item set)
freq_itemsets.loc[freq_itemsets.itemsets.ge({'White_Bread','Eggs'})]
Out[24]:
support | itemsets | |
---|---|---|
378 | 0.055147 | (White_Bread, Eggs) |
527 | 0.036029 | (2pct_Milk, White_Bread, Eggs) |
548 | 0.030147 | (White_Bread, 98pct_Fat_Free_Hamburger, Eggs) |
553 | 0.029412 | (Aspirin, White_Bread, Eggs) |
558 | 0.030147 | (White_Bread, Eggs, Cola) |
564 | 0.028676 | (Hot_Dogs, White_Bread, Eggs) |
567 | 0.027206 | (Onions, White_Bread, Eggs) |
568 | 0.025735 | (White_Bread, Pepperoni_Pizza_-_Frozen, Eggs) |
570 | 0.027206 | (Popcorn_Salt, White_Bread, Eggs) |
575 | 0.033824 | (Potato_Chips, White_Bread, Eggs) |
577 | 0.030882 | (White_Bread, Eggs, Potatoes) |
578 | 0.027206 | (Sugar_Cookies, White_Bread, Eggs) |
580 | 0.026471 | (White_Bread, Sweet_Relish, Eggs) |
581 | 0.025735 | (Toilet_Paper, Eggs, White_Bread) |
582 | 0.026471 | (Tomatoes, White_Bread, Eggs) |
583 | 0.032353 | (Toothpaste, White_Bread, Eggs) |
584 | 0.030147 | (Wheat_Bread, Eggs, White_Bread) |
600 | 0.026471 | (2pct_Milk, Potato_Chips, White_Bread, Eggs) |
601 | 0.025735 | (2pct_Milk, Toothpaste, White_Bread, Eggs) |
In [25]:
# The order of the items does not matter here (sets are unordered):
# "Eggs" and "White_Bread" yields the same selection as the previous cell
freq_itemsets.loc[freq_itemsets.itemsets.ge({'Eggs','White_Bread'})]
Out[25]:
support | itemsets | |
---|---|---|
378 | 0.055147 | (White_Bread, Eggs) |
527 | 0.036029 | (2pct_Milk, White_Bread, Eggs) |
548 | 0.030147 | (White_Bread, 98pct_Fat_Free_Hamburger, Eggs) |
553 | 0.029412 | (Aspirin, White_Bread, Eggs) |
558 | 0.030147 | (White_Bread, Eggs, Cola) |
564 | 0.028676 | (Hot_Dogs, White_Bread, Eggs) |
567 | 0.027206 | (Onions, White_Bread, Eggs) |
568 | 0.025735 | (White_Bread, Pepperoni_Pizza_-_Frozen, Eggs) |
570 | 0.027206 | (Popcorn_Salt, White_Bread, Eggs) |
575 | 0.033824 | (Potato_Chips, White_Bread, Eggs) |
577 | 0.030882 | (White_Bread, Eggs, Potatoes) |
578 | 0.027206 | (Sugar_Cookies, White_Bread, Eggs) |
580 | 0.026471 | (White_Bread, Sweet_Relish, Eggs) |
581 | 0.025735 | (Toilet_Paper, Eggs, White_Bread) |
582 | 0.026471 | (Tomatoes, White_Bread, Eggs) |
583 | 0.032353 | (Toothpaste, White_Bread, Eggs) |
584 | 0.030147 | (Wheat_Bread, Eggs, White_Bread) |
600 | 0.026471 | (2pct_Milk, Potato_Chips, White_Bread, Eggs) |
601 | 0.025735 | (2pct_Milk, Toothpaste, White_Bread, Eggs) |
Extraction des règles¶
In [26]:
# Import the rule-generation function
from mlxtend.frequent_patterns import association_rules
# Generate the rules from the frequent itemsets, filtering on the confidence
# metric with a threshold of 0.75
the_rules = association_rules(freq_itemsets,metric="confidence",min_threshold=0.75)
# Type of the returned object
print(type(the_rules))
<class 'pandas.core.frame.DataFrame'>
In [27]:
# Columns available for each rule (antecedents, consequents and the metrics)
the_rules.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 50 entries, 0 to 49 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 antecedents 50 non-null object 1 consequents 50 non-null object 2 antecedent support 50 non-null float64 3 consequent support 50 non-null float64 4 support 50 non-null float64 5 confidence 50 non-null float64 6 lift 50 non-null float64 7 representativity 50 non-null float64 8 leverage 50 non-null float64 9 conviction 50 non-null float64 10 zhangs_metric 50 non-null float64 11 jaccard 50 non-null float64 12 certainty 50 non-null float64 13 kulczynski 50 non-null float64 dtypes: float64(12), object(2) memory usage: 5.6+ KB
In [28]:
# Keep only the columns used in the rest of the notebook
regles = the_rules.loc[:, ['antecedents','consequents','support','confidence','lift']]
regles.head()
Out[28]:
antecedents | consequents | support | confidence | lift | |
---|---|---|---|---|---|
0 | (2pct_Milk, Aspirin) | (White_Bread) | 0.027206 | 0.787234 | 6.608878 |
1 | (2pct_Milk, Bananas) | (White_Bread) | 0.025735 | 0.813953 | 6.833190 |
2 | (White_Bread, Bananas) | (2pct_Milk) | 0.025735 | 0.795455 | 7.260525 |
3 | (Wheat_Bread, Cola) | (2pct_Milk) | 0.025735 | 0.795455 | 7.260525 |
4 | (2pct_Milk, Popcorn_Salt) | (Eggs) | 0.027206 | 0.822222 | 6.695941 |
In [29]:
# Rules ordered by decreasing lift - show the first 5
regles.sort_values(by='lift', ascending=False).iloc[:5]
Out[29]:
antecedents | consequents | support | confidence | lift | |
---|---|---|---|---|---|
48 | (2pct_Milk, Potato_Chips, White_Bread) | (Toothpaste) | 0.025000 | 0.755556 | 9.514403 |
29 | (Sweet_Relish, Hot_Dog_Buns) | (Hot_Dogs) | 0.030147 | 0.836735 | 9.031422 |
16 | (Hamburger_Buns, White_Bread) | (98pct_Fat_Free_Hamburger) | 0.026471 | 0.765957 | 8.202379 |
47 | (2pct_Milk, Toothpaste, White_Bread) | (Potato_Chips) | 0.025000 | 0.755556 | 7.725982 |
8 | (Onions, Wheat_Bread) | (2pct_Milk) | 0.028676 | 0.829787 | 7.573897 |
In [30]:
# Extract the rules whose consequent is exactly {"Eggs"}
regles.loc[regles.consequents.eq({'Eggs'})]
Out[30]:
antecedents | consequents | support | confidence | lift | |
---|---|---|---|---|---|
4 | (2pct_Milk, Popcorn_Salt) | (Eggs) | 0.027206 | 0.822222 | 6.695941 |
5 | (2pct_Milk, Potato_Chips) | (Eggs) | 0.033824 | 0.754098 | 6.141160 |
6 | (2pct_Milk, Tomatoes) | (Eggs) | 0.025735 | 0.795455 | 6.477953 |
15 | (98pct_Fat_Free_Hamburger, Potatoes) | (Eggs) | 0.025000 | 0.772727 | 6.292869 |
23 | (Hot_Dogs, Potatoes) | (Eggs) | 0.025000 | 0.772727 | 6.292869 |
24 | (White_Bread, Pepperoni_Pizza_-_Frozen) | (Eggs) | 0.025735 | 0.760870 | 6.196303 |
25 | (Popcorn_Salt, Potatoes) | (Eggs) | 0.025000 | 0.809524 | 6.592529 |
26 | (Popcorn_Salt, White_Bread) | (Eggs) | 0.027206 | 0.770833 | 6.277445 |
27 | (Sweet_Relish, Potatoes) | (Eggs) | 0.028676 | 0.795918 | 6.481730 |
28 | (Sugar_Cookies, White_Bread) | (Eggs) | 0.027206 | 0.770833 | 6.277445 |
40 | (2pct_Milk, Potato_Chips, White_Bread) | (Eggs) | 0.026471 | 0.800000 | 6.514970 |
43 | (2pct_Milk, Toothpaste, White_Bread) | (Eggs) | 0.025735 | 0.777778 | 6.333999 |
In [31]:
# Rules whose consequent is exactly {"Eggs"} and whose antecedent
# includes '2pct_Milk' (superset test on the antecedent)
regles[
    regles["consequents"].eq({'Eggs'})
    & regles["antecedents"].ge({'2pct_Milk'})
]
Out[31]:
antecedents | consequents | support | confidence | lift | |
---|---|---|---|---|---|
4 | (2pct_Milk, Popcorn_Salt) | (Eggs) | 0.027206 | 0.822222 | 6.695941 |
5 | (2pct_Milk, Potato_Chips) | (Eggs) | 0.033824 | 0.754098 | 6.141160 |
6 | (2pct_Milk, Tomatoes) | (Eggs) | 0.025735 | 0.795455 | 6.477953 |
40 | (2pct_Milk, Potato_Chips, White_Bread) | (Eggs) | 0.026471 | 0.800000 | 6.514970 |
43 | (2pct_Milk, Toothpaste, White_Bread) | (Eggs) | 0.025735 | 0.777778 | 6.333999 |
Autres algos pour les itemsets fréquents¶
FPGROWTH¶
In [32]:
# Still working from the boolean table
# Import the fpgrowth function
from mlxtend.frequent_patterns import fpgrowth
# Frequent itemsets, with the same settings as the apriori call
# (min_support=0.025, max_len=4, product names as items)
freq_growth = fpgrowth(TB,min_support=0.025,max_len=4,use_colnames=True)
# Dimensions of the resulting DataFrame
freq_growth.shape
Out[32]:
(603, 2)
In [33]:
# First 10 itemsets
freq_growth.head(10)
Out[33]:
support | itemsets | |
---|---|---|
0 | 0.033824 | (Peaches) |
1 | 0.040441 | (Vegetable_Oil) |
2 | 0.030882 | (Frozen_Corn) |
3 | 0.055882 | (Plums) |
4 | 0.057353 | (Pancake_Mix) |
5 | 0.029412 | (Cheese) |
6 | 0.030882 | (Cauliflower) |
7 | 0.109559 | (2pct_Milk) |
8 | 0.119118 | (White_Bread) |
9 | 0.097794 | (Potato_Chips) |
In [34]:
# Itemsets including White_Bread and Eggs
# The pair (White_Bread, Eggs) on its own is part of the result
freq_growth.loc[freq_growth.itemsets.ge({'White_Bread','Eggs'})]
Out[34]:
support | itemsets | |
---|---|---|
219 | 0.036029 | (2pct_Milk, White_Bread, Eggs) |
220 | 0.055147 | (White_Bread, Eggs) |
224 | 0.033824 | (Potato_Chips, White_Bread, Eggs) |
227 | 0.026471 | (2pct_Milk, Potato_Chips, White_Bread, Eggs) |
234 | 0.030147 | (White_Bread, 98pct_Fat_Free_Hamburger, Eggs) |
257 | 0.025735 | (Toilet_Paper, Eggs, White_Bread) |
271 | 0.027206 | (Onions, White_Bread, Eggs) |
296 | 0.027206 | (Sugar_Cookies, White_Bread, Eggs) |
305 | 0.028676 | (Hot_Dogs, White_Bread, Eggs) |
330 | 0.027206 | (Popcorn_Salt, White_Bread, Eggs) |
360 | 0.029412 | (Aspirin, White_Bread, Eggs) |
387 | 0.030882 | (White_Bread, Eggs, Potatoes) |
403 | 0.026471 | (White_Bread, Sweet_Relish, Eggs) |
418 | 0.032353 | (Toothpaste, White_Bread, Eggs) |
429 | 0.025735 | (Toothpaste, 2pct_Milk, White_Bread, Eggs) |
444 | 0.026471 | (Tomatoes, White_Bread, Eggs) |
476 | 0.030147 | (White_Bread, Eggs, Cola) |
492 | 0.025735 | (White_Bread, Pepperoni_Pizza_-_Frozen, Eggs) |
533 | 0.030147 | (Wheat_Bread, Eggs, White_Bread) |
FPMAX¶
In [35]:
# Still working from the boolean table
# Import the fpmax function
from mlxtend.frequent_patterns import fpmax
# Itemsets returned by fpmax, with the same settings as the previous
# extractions (min_support=0.025, max_len=4, product names as items)
freq_max = fpmax(TB,min_support=0.025,max_len=4,use_colnames=True)
# Dimensions of the resulting DataFrame
freq_max.shape
Out[35]:
(462, 2)
In [36]:
# First 10 itemsets
freq_max.head(10)
Out[36]:
support | itemsets | |
---|---|---|
0 | 0.025 | (Buttermilk) |
1 | 0.025 | (Mouthwash) |
2 | 0.025 | (Ice_Cream_Sandwich) |
3 | 0.025 | (Imported_Beer) |
4 | 0.025 | (Mushroom_Pizza_-_Frozen) |
5 | 0.025 | (Merlot_Wine) |
6 | 0.025 | (Mixed_Nuts) |
7 | 0.025 | (Tangerines) |
8 | 0.025 | (Turkey_Noodle_Soup) |
9 | 0.025 | (White_Wine) |
In [37]:
# Itemsets including White_Bread and Eggs
# The pair (White_Bread, Eggs) on its own is no longer part of the result;
# only larger itemsets containing both items are listed
freq_max.loc[freq_max.itemsets.ge({'White_Bread','Eggs'})]
Out[37]:
support | itemsets | |
---|---|---|
213 | 0.027206 | (Sugar_Cookies, White_Bread, Eggs) |
304 | 0.027206 | (Popcorn_Salt, White_Bread, Eggs) |
327 | 0.026471 | (Tomatoes, White_Bread, Eggs) |
344 | 0.029412 | (Aspirin, White_Bread, Eggs) |
359 | 0.025735 | (White_Bread, Pepperoni_Pizza_-_Frozen, Eggs) |
384 | 0.025735 | (Toilet_Paper, Eggs, White_Bread) |
398 | 0.030147 | (Wheat_Bread, Eggs, White_Bread) |
411 | 0.030147 | (White_Bread, Eggs, Cola) |
423 | 0.025735 | (2pct_Milk, Toothpaste, White_Bread, Eggs) |
431 | 0.027206 | (Onions, White_Bread, Eggs) |
438 | 0.026471 | (White_Bread, Sweet_Relish, Eggs) |
449 | 0.030882 | (White_Bread, Eggs, Potatoes) |
454 | 0.028676 | (Hot_Dogs, White_Bread, Eggs) |
460 | 0.030147 | (White_Bread, 98pct_Fat_Free_Hamburger, Eggs) |
461 | 0.026471 | (2pct_Milk, Potato_Chips, White_Bread, Eggs) |