Created
May 15, 2016 13:40
-
-
Save Renien/30dbb77ca85e4b0ec225c1874a8b37da to your computer and use it in GitHub Desktop.
We can find all frequent pairs by making two passes over the baskets. On the first pass, we count the items themselves, and then determine which items are frequent. On the second pass, we count only the pairs of items both of which were found to be frequent on the first pass. Monotonicity justifies ignoring the other pairs.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'renienj' | |
import urllib | |
from itertools import combinations | |
import time | |
from collections import defaultdict | |
""" | |
Sample Data Sets | |
---------------- | |
FRO11987 ELE17451 ELE89019 SNA90258 GRO99222 | |
GRO99222 GRO12298 FRO12685 ELE91550 SNA11465 ELE26917 ELE52966 FRO90334 SNA30755 ELE17451 FRO84225 SNA80192 | |
ELE17451 GRO73461 DAI22896 SNA99873 FRO86643 | |
ELE17451 ELE37798 FRO86643 GRO56989 ELE23393 SNA11465 | |
""" | |
def main(file_location, support, itemset_size):
    """
    Run the multi-pass A-Priori frequent-itemset search.

    file_location: path of the basket file (one whitespace-separated basket per line)
    support: minimum frequency count for an itemset to survive a pass
    itemset_size: largest itemset size to look for (= number of passes)

    Returns the candidate dictionary left after the final pass, mapping
    item tuples of length `itemset_size` to their frequency counts.
    """
    # defaultdict(int) is the idiomatic zero-initialised counter
    # (equivalent to, and clearer than, defaultdict(lambda: 0)).
    candidate_dct = defaultdict(int)
    for i in range(itemset_size):
        now = time.time()
        candidate_dct = data_pass(file_location, support, pass_nbr=i + 1, candidate_dct=candidate_dct)
        # print(...) with one %-formatted argument is valid on both
        # Python 2 (parenthesised expression) and Python 3 (function call),
        # unlike the bare Py2-only `print "..."` statement.
        print("pass number %i took %f and found %i candidates" % (i + 1, time.time() - now, len(candidate_dct)))
    return candidate_dct
def update_candidates(item_lst, candidate_dct, pass_nbr):
    """
    Update the candidate counts for a single basket on the current pass.

    - item_lst: the items of the current basket
    - candidate_dct: mapping of item tuples to counts (mutated in place)
    - pass_nbr: the pass number; pass 1 counts individual items

    Returns the updated candidate dictionary.
    """
    if pass_nbr == 1:
        # First pass: every individual item is counted as a 1-tuple key.
        for itm in item_lst:
            candidate_dct[(itm,)] += 1
        return candidate_dct

    # Later passes: collect the basket items that belong to at least one
    # frequent (pass_nbr - 1)-tuple surviving from the previous pass...
    ordered_items = sorted(item_lst)
    survivors = set()
    for sub_tuple in combinations(ordered_items, pass_nbr - 1):
        if sub_tuple in candidate_dct:
            survivors.update(sub_tuple)
    # ...then count every pass_nbr-sized combination of those survivors.
    for candidate in combinations(sorted(survivors), pass_nbr):
        candidate_dct[candidate] += 1
    return candidate_dct
def clear_items(candidate_dct, support, pass_nbr):
    """
    Prune the candidate dictionary at the end of a pass.

    Removes every itemset whose count is below `support`, and every
    itemset shorter than `pass_nbr` (leftover keys from earlier passes).

    - candidate_dct: mapping of item tuples to counts (mutated in place)
    - support: minimum frequency for an itemset to be kept
    - pass_nbr: the current pass number

    Returns the pruned dictionary.
    """
    # Snapshot the items with list(...): deleting keys while iterating a
    # live dict view raises RuntimeError on Python 3 (on Python 2 the
    # extra list() is harmless since .items() already returned a list).
    for item_tuple, cnt in list(candidate_dct.items()):
        if cnt < support or len(item_tuple) < pass_nbr:
            del candidate_dct[item_tuple]
    return candidate_dct
def data_pass(file_location, support, pass_nbr, candidate_dct):
    """
    Perform one full pass over the basket file.

    Streams the file line by line, updating the candidate counts for each
    basket, then prunes candidates below the support threshold.

    file_location: path of the input file (one basket per line)
    support: minimum frequency count
    pass_nbr: the current pass number
    candidate_dct: candidate counts carried over from the previous pass

    Returns the pruned candidate dictionary.
    """
    with open(file_location, 'r') as basket_file:
        for basket_line in basket_file:
            candidate_dct = update_candidates(basket_line.split(), candidate_dct, pass_nbr)
    # Prune once the whole file has been consumed.
    return clear_items(candidate_dct, support, pass_nbr)
if __name__ == "__main__": | |
file_location, headers = urllib.urlretrieve("http://goo.gl/w8nHnz") | |
support = 200 | |
itemset_size = 3 | |
itemsets_dct = main(file_location, support, itemset_size) | |
# Print the 1st 10 items | |
i = 0 | |
for itemset, frequency in itemsets_dct.iteritems(): | |
print itemset, frequency | |
i += 1 | |
if i == 10: | |
break |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment