Skip to content

Instantly share code, notes, and snippets.

@Mizzlr
Last active June 14, 2016 09:50
Show Gist options
  • Save Mizzlr/90949647b8d25dd5c2e62e14a39bbe0e to your computer and use it in GitHub Desktop.
Save Mizzlr/90949647b8d25dd5c2e62e14a39bbe0e to your computer and use it in GitHub Desktop.
Aggregator Tree Data Structure
class ATree:
"Aggregator Tree Data Structure"
def __init__(self):
self.count = 0
self.children = {}
def __repr__(self):
return "<ATree: %d\n\t%s>" \
% ((self.count, self.children)) \
if self.children else "<ATree: %d>" \
% self.count
def insert(self, chunk=""):
"recursively insert the chunk"
if len(chunk) == 0:
self.count += 1
elif len(chunk) == 1:
self.add_child(chunk)
self.children[chunk].insert()
else:
head = chunk[0]
tail = chunk[1:]
self.add_child(head)
self.children[head].insert(chunk=tail)
def add_child(self, key):
"add child if not already present"
if not self.children.has_key(key):
self.children[key] = ATree()
def aggregate(self):
"aggregate the counts recursively"
for key in self.children.keys():
self.children[key].aggregate()
self.count += sum([child.count
for child in self.children.values()])
def get_count(self, chunk=""):
"recurse to get the count after aggregation"
count = 0
if chunk == "":
count = self.count
elif len(chunk) == 1 and \
self.children.has_key(chunk):
count = self.children[chunk].get_count()
else:
head = chunk[0]
tail = chunk[1:]
if self.children.has_key(head):
count = self.children[head].get_count(tail)
return count
def to_dict(self, chunk=""):
"recurse to compute global dict from ATree"
global atree_dict
atree_dict[chunk] = self.count
for key in self.children.keys():
self.children[key].to_dict(chunk=chunk+key)
def chunkify(self, block, size):
"breaks the block into chunks of give size"
assert(len(block) >= size)
for i in range(len(block)-size+1):
yield block[i:i+size]
def process(self, block, size):
"main algorithm for ATree"
self.count = 0
self.children = {}
# handle equal sized chunks
# print list(self.chunkify(block,size))
for chunk in self.chunkify(block, size):
self.insert(chunk=chunk)
self.aggregate()
# handle tail cases
lens = len(block) - size + 1
for i in range(size):
chunk = block[lens:lens+i]
self.insert(chunk=chunk)
self.to_dict()
atree_dict = {}
if __name__ == '__main__':
import json, random, tabulate
random.seed(1001)
block_size, chunk_size = 15, 3
# generate a random string of ATGC
block = "".join(random.choice("ATGC")
for _ in range(block_size))
# apply our ATree algorithm
atree = ATree()
atree.process(block,chunk_size)
print "Block of size=%d is" % block_size
print block
print "ATree as dict: "
print tabulate.tabulate(
sorted(atree_dict.items()),
headers=("chunk","count"),
tablefmt="fancy_grid")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment