Last active
June 14, 2016 09:50
-
-
Save Mizzlr/90949647b8d25dd5c2e62e14a39bbe0e to your computer and use it in GitHub Desktop.
Aggregator Tree Data Structure
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ATree: | |
"Aggregator Tree Data Structure" | |
def __init__(self): | |
self.count = 0 | |
self.children = {} | |
def __repr__(self): | |
return "<ATree: %d\n\t%s>" \ | |
% ((self.count, self.children)) \ | |
if self.children else "<ATree: %d>" \ | |
% self.count | |
def insert(self, chunk=""): | |
"recursively insert the chunk" | |
if len(chunk) == 0: | |
self.count += 1 | |
elif len(chunk) == 1: | |
self.add_child(chunk) | |
self.children[chunk].insert() | |
else: | |
head = chunk[0] | |
tail = chunk[1:] | |
self.add_child(head) | |
self.children[head].insert(chunk=tail) | |
def add_child(self, key): | |
"add child if not already present" | |
if not self.children.has_key(key): | |
self.children[key] = ATree() | |
def aggregate(self): | |
"aggregate the counts recursively" | |
for key in self.children.keys(): | |
self.children[key].aggregate() | |
self.count += sum([child.count | |
for child in self.children.values()]) | |
def get_count(self, chunk=""): | |
"recurse to get the count after aggregation" | |
count = 0 | |
if chunk == "": | |
count = self.count | |
elif len(chunk) == 1 and \ | |
self.children.has_key(chunk): | |
count = self.children[chunk].get_count() | |
else: | |
head = chunk[0] | |
tail = chunk[1:] | |
if self.children.has_key(head): | |
count = self.children[head].get_count(tail) | |
return count | |
def to_dict(self, chunk=""): | |
"recurse to compute global dict from ATree" | |
global atree_dict | |
atree_dict[chunk] = self.count | |
for key in self.children.keys(): | |
self.children[key].to_dict(chunk=chunk+key) | |
def chunkify(self, block, size): | |
"breaks the block into chunks of give size" | |
assert(len(block) >= size) | |
for i in range(len(block)-size+1): | |
yield block[i:i+size] | |
def process(self, block, size): | |
"main algorithm for ATree" | |
self.count = 0 | |
self.children = {} | |
# handle equal sized chunks | |
# print list(self.chunkify(block,size)) | |
for chunk in self.chunkify(block, size): | |
self.insert(chunk=chunk) | |
self.aggregate() | |
# handle tail cases | |
lens = len(block) - size + 1 | |
for i in range(size): | |
chunk = block[lens:lens+i] | |
self.insert(chunk=chunk) | |
self.to_dict() | |
atree_dict = {} | |
if __name__ == '__main__': | |
import json, random, tabulate | |
random.seed(1001) | |
block_size, chunk_size = 15, 3 | |
# generate a random string of ATGC | |
block = "".join(random.choice("ATGC") | |
for _ in range(block_size)) | |
# apply our ATree algorithm | |
atree = ATree() | |
atree.process(block,chunk_size) | |
print "Block of size=%d is" % block_size | |
print block | |
print "ATree as dict: " | |
print tabulate.tabulate( | |
sorted(atree_dict.items()), | |
headers=("chunk","count"), | |
tablefmt="fancy_grid") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment