Skip to content

Instantly share code, notes, and snippets.

@Renien
Created May 15, 2016 13:40
Show Gist options
  • Save Renien/30dbb77ca85e4b0ec225c1874a8b37da to your computer and use it in GitHub Desktop.
Save Renien/30dbb77ca85e4b0ec225c1874a8b37da to your computer and use it in GitHub Desktop.
We can find all frequent pairs by making two passes over the baskets. On the first pass, we count the items themselves, and then determine which items are frequent. On the second pass, we count only the pairs of items both of which are found frequently on the first pass. Monotonicity justifies our ignoring other pairs.
__author__ = 'renienj'
import urllib
from itertools import combinations
import time
from collections import defaultdict
"""
Sample Data Sets
----------------
FRO11987 ELE17451 ELE89019 SNA90258 GRO99222
GRO99222 GRO12298 FRO12685 ELE91550 SNA11465 ELE26917 ELE52966 FRO90334 SNA30755 ELE17451 FRO84225 SNA80192
ELE17451 GRO73461 DAI22896 SNA99873 FRO86643
ELE17451 ELE37798 FRO86643 GRO56989 ELE23393 SNA11465
"""
def main(file_location, support, itemset_size):
"""
file_location: The document location
support: The frequency count
itemset_size: The combine item size
"""
candidate_dct = defaultdict(lambda: 0)
for i in range(itemset_size):
now = time.time()
candidate_dct = data_pass(file_location, support, pass_nbr=i + 1, candidate_dct=candidate_dct)
print "pass number %i took %f and found %i candidates" % (i + 1, time.time() - now, len(candidate_dct))
return candidate_dct
def update_candidates(item_lst, candidate_dct, pass_nbr):
"""
During the iteration the data collection update
the candidate dictionary according to the current pass_nbr.
- item_lst: Current list
- candidate_dct: The list of generated combine set
- pass_nbr: The pass # number
"""
if pass_nbr == 1:
for item in item_lst:
candidate_dct[(item,)] += 1
else:
frequent_items_set = set()
for item_tuple in combinations(sorted(item_lst), pass_nbr - 1):
if item_tuple in candidate_dct:
frequent_items_set.update(item_tuple)
for item_set in combinations(sorted(frequent_items_set), pass_nbr):
candidate_dct[item_set] += 1
return candidate_dct
def clear_items(candidate_dct, support, pass_nbr):
"""
This function will help to clear the candidate_dct object
according to the following associate rules
- support: The frequency of the sets
- pass_nbr: The pass # number
"""
for item_tuple, cnt in candidate_dct.items():
if cnt < support or len(item_tuple) < pass_nbr:
del candidate_dct[item_tuple]
return candidate_dct
def data_pass(file_location, support, pass_nbr, candidate_dct):
"""
Generate the datapass according to the pass count
file_location: The document location
support: The frequency count
pass_nbr: The current pass count
candidate_dct: The list of generated combine set
"""
with open(file_location, 'r') as f:
for line in f:
item_lst = line.split()
candidate_dct = update_candidates(item_lst, candidate_dct, pass_nbr)
candidate_dct = clear_items(candidate_dct, support, pass_nbr)
return candidate_dct
if __name__ == "__main__":
file_location, headers = urllib.urlretrieve("http://goo.gl/w8nHnz")
support = 200
itemset_size = 3
itemsets_dct = main(file_location, support, itemset_size)
# Print the 1st 10 items
i = 0
for itemset, frequency in itemsets_dct.iteritems():
print itemset, frequency
i += 1
if i == 10:
break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment