Skip to content

Instantly share code, notes, and snippets.

@dimo414
Created June 24, 2012 02:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dimo414/2981061 to your computer and use it in GitHub Desktop.
Save dimo414/2981061 to your computer and use it in GitHub Desktop.
Testing Prefix Lookup Data Structures
import bisect
import error;
class Prefix:
'''
A prefix data structure built on a sorted list, which uses binary search.
Note that for more reasonable lookup, it only searches in lower
case. This means there can be colliding strings such as 'prefix' vs 'Prefix'.
In this case, more recent inserts override older ones.
'''
def __init__(self, ls=[], presorted=False):
self._aliases = {}
self._list = ls if presorted else sorted(ls)
self._keys = [s.lower() for s in self._list]
# Note that since we usually use these methods together, it's wasteful to
# Compute prefix.lower() for both - as such, these methods assume prefix
# is already lower case.
def _getindex(self, prefix):
return bisect.bisect_left(self._keys, prefix)
def _getnextindex(self, prefix):
# http://stackoverflow.com/a/7381253/113632
lo, hi = 0, len(self._keys)
while lo < hi:
mid = (lo+hi)//2
if prefix < self._keys[mid] and not self._keys[mid].startswith(prefix): hi = mid
else: lo = mid+1
return lo
def __getitem__(self, prefix):
'''Return the item with the given prefix.
If more than one item matches the prefix an AmbiguousPrefix exception
will be raised, unless the prefix is the entire ID of one task.
If no items match the prefix an UnknownPrefix exception will be raised.
If an item exactly matches the prefix, it will be returned even if
there exist other (longer) items which match the prefix
'''
pre = prefix.lower()
ret = self._list[self._getindex(pre):self._getnextindex(pre)]
if ret:
if len(ret) == 1 or ret[0].lower() == pre:
return ret[0]
raise error.AmbiguousPrefix(prefix,ret)
raise error.UnknownPrefix(prefix)
def prefix(self, item):
'''Return the unique prefix of the given item, or None if not found'''
ln = len(self._keys)
item = item.lower()
index = self._getindex(item)
if index >= ln:
return None
match = self._keys[index]
if not match.startswith(item):
return None
siblings = []
if index > 0:
siblings.append(self._keys[index-1])
if index < ln-1:
siblings.append(self._keys[index+1])
if not siblings: #list contains only item
return match[0]
return self._uniqueprefix(match,siblings)
def _uniqueprefix(self,match,others):
'''Returns the unique prefix of match, against the set of others'''
ret = []
#print("START:",match,others)
while match:
others = [s[1:] for s in others if s and s[0] == match[0]]
ret.append(match[0])
match = match[1:]
#print("WHILE:",match,others,''.join(ret))
if not others:
return ''.join(ret)
return None
def add(self,item):
'''Add an item to the data structure.
This uses list.insert() which is O(n) - for many insertions,
it may be dramatically faster to simply build a new Prefix entirely.'''
lower = item.lower()
index = self._getindex(lower)
# If overwriting same key
if index < len(self._keys) and self._keys[index] == lower:
self._list[index] = item
else:
self._keys.insert(index,lower)
self._list.insert(index,item)
def alias(self,alias,item):
'''Add an item to the trie which maps to another item'''
self._aliases[alias] = self[item]
self.add(alias)
def pref_str(self,pref,short=False):
'''returns the item with a colon indicating the shortest unique prefix'''
item = self[pref]
pref = self.prefix(item)
tail = item[len(pref):]
return item[:len(pref)]+':'+(tail[:4]+('...' if len(tail) > 4 else '') if short else tail)
def __iter__(self):
return iter(self._list)
class UnknownPrefix(Exception):
'''Raised if a given prefix does not map to any items'''
def __init__(self,prefix,*args):
Exception.__init__(self,*args)
self.prefix = prefix
class AmbiguousPrefix(Exception):
'''Raised if a given prefix maps to multiple items'''
def __init__(self,prefix,choices,*args):
Exception.__init__(self,*args)
self.prefix = prefix
self.choices = choices
import hashlib,time
import bprefix,tprefix
def timed(f):
def func(*args):
start = time.time()
ret = f(*args)
took = time.time() - start
print("%s took %f" % (f.__name__,took))
return ret
return func
def get_generator(top=250000,static="Static_String"):
return (hashlib.sha1((static+str(i)).encode('utf-8')).hexdigest() for i in range(top))
@timed
def build_from_list(cls):
return cls(get_generator())
@timed
def build_from_adds(cls):
pref = cls()
for s in get_generator():
pref.add(s)
return pref
@timed
def add_to(obj):
for s in get_generator(10000,"Different_String"):
obj.add(s)
@timed
def get(obj,loops=10000):
for _ in range(loops):
obj['000188']
obj['1971e']
obj['336f7']
obj['4d120']
obj['66ada']
obj['80736']
obj['99cb0']
obj['b38f3']
obj['ccfd9e8']
obj['e61df']
@timed
def prefix(obj,loops=10000):
for _ in range(loops):
obj.prefix('00018855b442bfba15fae6949982ef63d9eba1c9')
obj.prefix('1971e17df8ee57f0dcceccc869db454b7c6b7a54')
obj.prefix('336f7b09c7c0c933b1f26ca09a84585818046e6b')
obj.prefix('4d1209fadf843f65bd2beee37db552134a930395')
obj.prefix('66adaf0cb611d71554153631611f7904781addef')
obj.prefix('80736f454201d96c4c795bb5e21778550a9cbef0')
obj.prefix('99cb006fd81cb84cfbae834ed7b7f977c29af249')
obj.prefix('b38f3561591650708fce739536ac504f86fecdf5')
obj.prefix('ccfd9e8211621666c55f911d1ff3f13ab93f696e')
obj.prefix('e61df82a0c2f59394eb9bd752e93b9c011df5be2')
@timed
def iter(obj):
for s in obj:
len(s)
def run_tests(cls):
pref = build_from_list(cls)
build_from_adds(cls)
add_to(pref)
get(pref)
prefix(pref)
iter(pref)
if __name__ == '__main__':
print("TRIE STRUCTURE")
run_tests(tprefix.Prefix)
print("BINARY SEARCH STRUCTURE")
run_tests(bprefix.Prefix)
import error;
class Prefix:
'''
A trie datastructure (http://en.wikipedia.org/wiki/Trie) which enables
fast data retrieval by prefix.
Note that for more reasonable lookup, the trie only searches in lower
case. This means there can be colliding strings such as 'prefix' vs 'Prefix'.
In this case, more recent inserts override older ones.
'''
def __init__(self,ls=[]):
self._root = _Node(None,None,True)
self._aliases = {}
for i in ls:
self._root.add(i.lower(),i)
def __getitem__(self, prefix):
'''Return the item with the given prefix.
If more than one item matches the prefix an AmbiguousPrefix exception
will be raised, unless the prefix is the entire ID of one task.
If no items match the prefix an UnknownPrefix exception will be raised.
If an item exactly matches the prefix, it will be returned even if
there exist other (longer) items which match the prefix
'''
matched = self._root[prefix.lower()]
if (matched is None or
(matched.result is None and len(matched.children) == 0) or
# this is needed because prefix will return if prefix
# is longer than necessary, and the necessary part
# matches, but the extra text does not
(matched.key is not None and not matched.key.startswith(prefix.lower()))):
raise error.UnknownPrefix(prefix)
if matched.result is not None:
if matched.result in self._aliases:
return self._aliases[matched.result]
else: return matched.result
else:
raise error.AmbiguousPrefix(prefix,iter(matched))
def prefix(self,item):
'''Return the unique prefix of the given item, or None if not found'''
return self._root.prefix(item.lower())
def pref_str(self,pref,short=False):
'''returns the item with a colon indicating the shortest unique prefix'''
item = self[pref]
pref = self.prefix(item)
tail = item[len(pref):]
return item[:len(pref)]+':'+(tail[:4]+('...' if len(tail) > 4 else '') if short else tail)
def add(self,item):
'''Add an item to the data structure'''
self._root.add(item.lower(),item,0)
def alias(self,alias,item):
'''Add an item to the trie which maps to another item'''
self._aliases[alias] = self[item]
self.add(alias)
def __iter__(self):
return iter(self._root)
class _Node:
'''Represents a node in the Trie. It contains either
an exact match, a set of children, or both
'''
def __init__(self, key, item, final=False):
'''Constructs a new node which contains an item'''
self.final = final
self.key = key
self.result = item
self.children = {}
def _tree(self):
'''Returns a tree structure representing the trie. Useful for debugging'''
return "( %s%s, { %s } )" % (self.result, '*' if self.final else '',
', '.join(["%s: %s" % (k,v._tree()) for (k,v) in self.children.items()]))
def add(self,key,item,depth=0):
'''Adds an item at this node or deeper. Depth indicates
which index is being used for comparison'''
# the correct node has been found, replace result with new value
if self.key is not None and key == self.key:
self.result = item #this would override an old value
return
# this is currently a leaf node, move the leave one down
if self.result is not None and not self.final:
if self.key == None: print(self.key,self.result,key,item)
key_letter = self.key[depth]
self.children[key_letter] = _Node(self.key,self.result,len(self.key)==depth+1)
self.key = None
self.result = None
if len(item) == depth:
self.key = key
self.result = item #this could override an old value
self.final = True
return
letter = key[depth]
if letter in self.children:
child = self.children[letter]
child.add(key,item,depth+1)
else:
self.children[letter] = _Node(key,item,len(key) == depth+1)
def __getitem__(self,prefix):
'''Given a prefix, returns the node that matches
This will either have a result, or if not the prefix
was ambiguous. If None is returned, there was no
such prefix'''
if len(prefix) == 0 or len(self.children) == 0:
return self
letter = prefix[0]
if letter in self.children:
return self.children[letter][prefix[1:]]
else: return None
def prefix(self,item):
'''Given an item (or a prefix) finds the shortest
prefix necessary to reach the given item.
None if item does not exist.'''
if len(item) == 0 or len(self.children) == 0:
return ''
letter = item[0]
if letter in self.children:
child = self.children[letter].prefix(item[1:])
if child is not None:
return letter + child
return None
def __iter__(self):
'''Yields items in and below this node'''
if self.result:
yield self.result
for k in sorted(self.children.keys()):
for res in self.children[k]:
yield res
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment