Skip to content

Instantly share code, notes, and snippets.

@htnminh
Last active April 18, 2023 10:45
Show Gist options
  • Save htnminh/b9d0cf7db88b3c28a4a06658b8237d35 to your computer and use it in GitHub Desktop.
Save htnminh/b9d0cf7db88b3c28a4a06658b8237d35 to your computer and use it in GitHub Desktop.
apriori VDT 2 - 31 Hoang Tran Nhat Minh
import pprint
import pandas as pd
# Location of the groceries dataset read by the main program below.
PATH_TO_CSV = r"C:\Users\nhatm\OneDrive - Hanoi University of Science and Technology\Desktop\groceries.csv"
# Minimum support passed to get_frequent_itemsets for the groceries run; it is
# compared against raw transaction counts (an absolute count, not a fraction).
MIN_SUPPORT = 12
def get_frequent_itemsets(transactions, min_support):
    """Return every frequent itemset found by the Apriori algorithm.

    An itemset is frequent when it is contained in at least ``min_support``
    transactions (an absolute count, not a fraction).

    Args:
        transactions: Iterable of transactions, each an iterable of hashable
            items. Repeated items inside one transaction count once, and
            one-shot iterables (generators, iterators) are accepted.
        min_support: Minimum number of transactions that must contain an
            itemset for it to be reported.

    Returns:
        A set of frozensets, one per frequent itemset of any size.
    """
    from collections import Counter
    from itertools import combinations

    # Materialize once: the data is scanned once per level, and turning each
    # transaction into a frozenset makes support "number of transactions
    # containing the item" even when a transaction lists an item twice.
    baskets = [frozenset(transaction) for transaction in transactions]

    # Count the occurrence of each item and discard infrequent items
    item_counts = Counter(item for basket in baskets for item in basket)
    frequent_items = {frozenset([item]) for item, count in item_counts.items()
                      if count >= min_support}

    # Level-wise search: frequent (k-1)-itemsets produce the k-candidates
    itemsets = set(frequent_items)
    k = 2
    while itemsets:
        # Join step: unions of two frequent (k-1)-itemsets that have size k
        candidate_itemsets = set()
        for itemset1, itemset2 in combinations(itemsets, 2):
            merged = itemset1 | itemset2
            if len(merged) == k:
                candidate_itemsets.add(merged)

        # Prune step: a frequent k-itemset needs all of its (k-1)-subsets to
        # be frequent, so any other candidate can be dropped before counting.
        candidate_itemsets = {
            candidate for candidate in candidate_itemsets
            if all(candidate - {item} in itemsets for item in candidate)
        }
        if not candidate_itemsets:
            break

        # Count the occurrence of each candidate itemset
        itemset_counts = dict.fromkeys(candidate_itemsets, 0)
        for basket in baskets:
            for candidate in candidate_itemsets:
                if candidate <= basket:
                    itemset_counts[candidate] += 1

        # Discard infrequent itemsets; the survivors seed the next level
        itemsets = {candidate for candidate, count in itemset_counts.items()
                    if count >= min_support}

        # Add the frequent itemsets to the output
        frequent_items.update(itemsets)
        k += 1

    return frequent_items
# Test only: mine a tiny hand-checkable dataset with a minimum support of 3
pprint.pprint(
    get_frequent_itemsets(
        transactions=[
            {'A', 'B', 'D', 'E'},
            {'B', 'C', 'E'},
            {'A', 'B', 'D', 'E'},
            {'A', 'B', 'C', 'E'},
            {'A', 'B', 'C', 'D', 'E'},
            {'B', 'C', 'D'},
        ],
        min_support=3,
    )
)
print()
# Main program: load the groceries CSV and preview its first rows
df = pd.read_csv(PATH_TO_CSV, header=0, index_col=False)
print(df.head())

# Build one set of items per row. Column 0 is used as the number of items in
# the row, so the slice keeps the item columns 1..count.
transactions = [
    set(df.iloc[row, 1: df.iloc[row, 0] + 1])
    for row in range(len(df))
]
pprint.pprint(transactions[:5])
print()

# Mine the full dataset with the configured minimum support
print(f'For min_support={MIN_SUPPORT}:')
pprint.pprint(get_frequent_itemsets(transactions, MIN_SUPPORT))
"""Output:
{frozenset({'A', 'E', 'B'}),
frozenset({'E', 'B', 'C'}),
frozenset({'D'}),
frozenset({'A'}),
frozenset({'A', 'D'}),
frozenset({'B'}),
frozenset({'A', 'B'}),
frozenset({'B', 'D'}),
frozenset({'A', 'D', 'B'}),
frozenset({'E'}),
frozenset({'E', 'C'}),
frozenset({'E', 'B'}),
frozenset({'E', 'D'}),
frozenset({'E', 'B', 'D'}),
frozenset({'C'}),
frozenset({'B', 'C'}),
frozenset({'E', 'A'}),
frozenset({'A', 'E', 'D'}),
frozenset({'A', 'B', 'E', 'D'})}
Item(s) Item 1 Item 2 Item 3 Item 4 Item 5 Item 6 Item 7 Item 8 Item 9 ... Item 23 Item 24 Item 25 Item 26 Item 27 Item 28 Item 29 Item 30 Item 31 Item 32
0 4 citrus fruit semi-finished bread margarine ready soups NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 3 tropical fruit yogurt coffee NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 1 whole milk NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 4 pip fruit yogurt cream cheese meat spreads NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 4 other vegetables whole milk condensed milk long life bakery product NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
[5 rows x 33 columns]
[{'citrus fruit', 'semi-finished bread', 'margarine', 'ready soups'},
{'tropical fruit', 'yogurt', 'coffee'},
{'whole milk'},
{'meat spreads', 'yogurt', 'cream cheese', 'pip fruit'},
{'condensed milk',
'long life bakery product',
'other vegetables',
'whole milk'}]
For min_support=12:
{frozenset({'rolls/buns', 'whole milk'}),
frozenset({'citrus fruit'}),
frozenset({'curd'}),
frozenset({'shopping bags'}),
frozenset({'bottled water'}),
frozenset({'whole milk'}),
frozenset({'fruit/vegetable juice'}),
frozenset({'yogurt'}),
frozenset({'coffee'}),
frozenset({'frankfurter'}),
frozenset({'root vegetables'}),
frozenset({'tropical fruit'}),
frozenset({'soda'}),
frozenset({'sugar'}),
frozenset({'newspapers'}),
frozenset({'rolls/buns'}),
frozenset({'pastry'}),
frozenset({'canned beer'}),
frozenset({'bottled beer'}),
frozenset({'sausage'}),
frozenset({'other vegetables'}),
frozenset({'whole milk', 'other vegetables'})}
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment