Skip to content

Instantly share code, notes, and snippets.

@nodirt
Created November 7, 2017 18:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nodirt/dc708ab1ff3f7ed39cd0c069218e8aa5 to your computer and use it in GitHub Desktop.
Save nodirt/dc708ab1ff3f7ed39cd0c069218e8aa5 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import base64, bisect, genericpath, string, zlib
def compress(items):
class Node(object):
def __init__(self):
self.children = {}
self.descendants = 0
self.value = False
def compress(self):
def visit(self, self_key):
self.children = {
visit(child, key): child
for key, child in self.children.iteritems()
if child.children or child.value
}
if self_key and len(self.children) == 1 and not self.value:
[key, child], = self.children.iteritems()
self_key += key
self.children = child.children
self.value = child.value
return self_key
visit(self, None)
def update_descendants(self):
self.descendants = 0
for c in self.children.itervalues():
c.update_descendants()
self.descendants += 1 + c.descendants
def dump(self, out):
for key, child in sorted(self.children.iteritems()):
if child.descendants > 0:
sep = '+' if child.value else ' '
out.append('%d%s%s' % (child.descendants, sep, key))
elif child.value:
out.append(key)
child.dump(out)
root = Node()
for item in items:
node = root
for c in item:
child = node.children.get(c)
if not child:
child = Node()
node.children[c] = child
node = child
node.value = True
root.compress()
root.update_descendants()
out = []
root.dump(out)
return out
def is_present(d, k):
i = 0
iterations = 0
found = False
orig_k = k
while k and i < len(d):
iterations += 1
key = d[i]
descendants = 0
value = True
if ' ' in key or '+' in key:
value = '+' in key
sep = '+' if value else ' '
descendants, key = key.split(sep, 2)
descendants = int(descendants)
if k == key and value:
found = True
break
if k.startswith(key):
if descendants == 0:
break
k = k[len(key):]
i += 1 # we must go deeper
elif key > k:
break
else:
i += descendants + 1 # next sibling
print('search for %s took %d iterations' % (orig_k, iterations))
return found
def test2():
engs = open('eng.txt').read().splitlines()
flat_engs = '\n'.join(engs)
d = compress(engs)
flat_d = '\n'.join(d)
open('eng.tree', 'w').write(flat_d)
print 'Raw data descendants: %d' % len(flat_engs)
print 'Encoded descendants: %d' % len(flat_d)
print 'Compressed descendants: %d' % len(zlib.compress(flat_d))
print 'Compressed base64: %d' % len(base64.b64encode(zlib.compress(flat_d)))
# Show how it looks like:
print flat_d[:100]
decoded = flat_d.split('\n')
matches = ['maruel', 'vadimsh']
for m in matches:
assert is_present(decoded, m), m
misses = ['xxxxx', 'aaaaaaa', 'qwerty']
for m in misses:
assert not is_present(decoded, m), m
test2()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment