Created
May 15, 2016 13:40
-
-
Save Renien/30dbb77ca85e4b0ec225c1874a8b37da to your computer and use it in GitHub Desktop.
We can find all frequent pairs by making two passes over the baskets. On the first pass, we count the items themselves, and then determine which items are frequent. On the second pass, we count only the pairs of items both of which were found to be frequent on the first pass. Monotonicity justifies ignoring the other pairs.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'renienj' | |
import urllib | |
from itertools import combinations | |
import time | |
from collections import defaultdict | |
""" | |
Sample Data Sets | |
---------------- | |
FRO11987 ELE17451 ELE89019 SNA90258 GRO99222 | |
GRO99222 GRO12298 FRO12685 ELE91550 SNA11465 ELE26917 ELE52966 FRO90334 SNA30755 ELE17451 FRO84225 SNA80192 | |
ELE17451 GRO73461 DAI22896 SNA99873 FRO86643 | |
ELE17451 ELE37798 FRO86643 GRO56989 ELE23393 SNA11465 | |
""" | |
def main(file_location, support, itemset_size):
    """
    Run the multi-pass A-Priori frequent-itemset search.

    file_location: path of the basket file (one whitespace-separated basket per line)
    support: minimum frequency count for an itemset to survive a pass
    itemset_size: largest itemset size to look for (= number of passes)

    Returns the candidate dictionary left after the final pass, mapping
    item tuples of length `itemset_size` to their frequency counts.
    """
    # defaultdict(int) is the idiomatic zero-initialised counter
    # (equivalent to, and clearer than, defaultdict(lambda: 0)).
    candidate_dct = defaultdict(int)
    for i in range(itemset_size):
        now = time.time()
        candidate_dct = data_pass(file_location, support, pass_nbr=i + 1, candidate_dct=candidate_dct)
        # print(...) with one %-formatted argument is valid on both
        # Python 2 (parenthesised expression) and Python 3 (function call),
        # unlike the bare Py2-only `print "..."` statement.
        print("pass number %i took %f and found %i candidates" % (i + 1, time.time() - now, len(candidate_dct)))
    return candidate_dct
def update_candidates(item_lst, candidate_dct, pass_nbr):
    """
    Update the candidate counts for a single basket on the current pass.

    - item_lst: the items of the current basket
    - candidate_dct: mapping of item tuples to counts (mutated in place)
    - pass_nbr: the pass number; pass 1 counts individual items

    Returns the updated candidate dictionary.
    """
    if pass_nbr == 1:
        # First pass: every individual item is counted as a 1-tuple key.
        for itm in item_lst:
            candidate_dct[(itm,)] += 1
        return candidate_dct

    # Later passes: collect the basket items that belong to at least one
    # frequent (pass_nbr - 1)-tuple surviving from the previous pass...
    ordered_items = sorted(item_lst)
    survivors = set()
    for sub_tuple in combinations(ordered_items, pass_nbr - 1):
        if sub_tuple in candidate_dct:
            survivors.update(sub_tuple)
    # ...then count every pass_nbr-sized combination of those survivors.
    for candidate in combinations(sorted(survivors), pass_nbr):
        candidate_dct[candidate] += 1
    return candidate_dct
def clear_items(candidate_dct, support, pass_nbr):
    """
    Prune the candidate dictionary at the end of a pass.

    Removes every itemset whose count is below `support`, and every
    itemset shorter than `pass_nbr` (leftover keys from earlier passes).

    - candidate_dct: mapping of item tuples to counts (mutated in place)
    - support: minimum frequency for an itemset to be kept
    - pass_nbr: the current pass number

    Returns the pruned dictionary.
    """
    # Snapshot the items with list(...): deleting keys while iterating a
    # live dict view raises RuntimeError on Python 3 (on Python 2 the
    # extra list() is harmless since .items() already returned a list).
    for item_tuple, cnt in list(candidate_dct.items()):
        if cnt < support or len(item_tuple) < pass_nbr:
            del candidate_dct[item_tuple]
    return candidate_dct
def data_pass(file_location, support, pass_nbr, candidate_dct):
    """
    Perform one full pass over the basket file.

    Streams the file line by line, updating the candidate counts for each
    basket, then prunes candidates below the support threshold.

    file_location: path of the input file (one basket per line)
    support: minimum frequency count
    pass_nbr: the current pass number
    candidate_dct: candidate counts carried over from the previous pass

    Returns the pruned candidate dictionary.
    """
    with open(file_location, 'r') as basket_file:
        for basket_line in basket_file:
            candidate_dct = update_candidates(basket_line.split(), candidate_dct, pass_nbr)
    # Prune once the whole file has been consumed.
    return clear_items(candidate_dct, support, pass_nbr)
if __name__ == "__main__": | |
file_location, headers = urllib.urlretrieve("http://goo.gl/w8nHnz") | |
support = 200 | |
itemset_size = 3 | |
itemsets_dct = main(file_location, support, itemset_size) | |
# Print the 1st 10 items | |
i = 0 | |
for itemset, frequency in itemsets_dct.iteritems(): | |
print itemset, frequency | |
i += 1 | |
if i == 10: | |
break |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment