Skip to content

Instantly share code, notes, and snippets.

@josh-howes
Created November 17, 2015 20:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save josh-howes/439ef4fd7d1893ec9cf5 to your computer and use it in GitHub Desktop.
Save josh-howes/439ef4fd7d1893ec9cf5 to your computer and use it in GitHub Desktop.
Han, Pei and Yin's novel frequent pattern tree (FP-tree) structure
"""Mining Frequent Patterns without Candidate Generation.
Naive implementation of the data structures in the following paper <http://hanj.cs.illinois.edu/pdf/sigmod00.pdf>
"""
class HeaderTable(dict):
"""Header Table.
To facilitate tree traversal, an item header table is built in which each item points to its
occurrence in the tree via a head of node-link. Nodes with the same item-name are linked in
sequence via such node-links.
"""
def follow_node_links(self, item_name):
if item_name not in self.keys():
return None
header_node = self.get(item_name)
node = header_node
while node.node_link:
node = none.node_link
return node
class FPTree(object):
def __init__(self, item_name='null', count=0, node_link=None, parent=None):
"""FP Node.
Item Prefix Subtree of the Frequent Pattern Tree.
Arguments
---------
item_name : str
Registers which item this node represents
count : int
The number of transactions represented by the portion of the path reaching this node.
children : list
A set of item prefix subtrees as the children of the root.
node_link : `FPNode`
Node in the FP-tree carrying the same item-name or null if there is none.
header_table : dict
Frequent-item header table.
"""
self.item_name = item_name
self.count = count
self.node_link = node_link
self.subtrees = list()
self._lookup = dict()
self._tree_count = 0
def __repr__(self):
return '<FPNode(name="{0}", count={1})>'.format(self.item_name, self.count)
def has_child(self, name):
return name in self._lookup.keys()
def get_subtree(self, name):
return self.subtrees[self._lookup[name]]
def insert_tree(self, transactions):
head, rest = transactions[0], transactions[1:]
if self.has_child(head):
tree = self.get_subtree(head)
tree.count += 1
else:
tree = FPTree(item_name=head,
count=1,
parent=self)
self._lookup[head] = self._tree_count
self._tree_count += 1
self.subtrees.append(tree)
# Add node_links
parent = header_table.follow_node_links(head)
if not parent:
header_table[head] = tree
else:
parent.node_link = tree
if rest:
tree.insert_tree(rest)
header_table = HeaderTable()
def create_fptree(transactions):
"""Create a Frequent Pattern Tree from a list of transactions.
Arguments
---------
transactions : list
List of transactions, where a transaction is a set of items.
Returns
-------
fp_tree : `FPTree`
"""
counts = defaultdict(int)
for trans in transactions:
for item in trans:
counts[item] += 1
tree = FPTree(None)
for trans in transactions:
trans.sort(key=lambda x: counts[x], reverse=True)
tree.insert_tree(trans)
return tree
def print_tree(tree):
"""Print Tree.
Print the structure of a Frequent Pattern Tree to stdout.
Arguments
---------
tree : `FPTree`
"""
def _print_tree(tree, level=0):
assert isinstance(tree, FPTree)
print("{pad}+-{node}".format(pad=" " * (level), node=str(tree)))
for child in tree.subtrees:
_print_tree(child, level+1)
_print_tree(tree)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment