Skip to content

Instantly share code, notes, and snippets.

@josiahcarlson
Created May 6, 2013 16:29
Show Gist options
  • Save josiahcarlson/5526230 to your computer and use it in GitHub Desktop.
Save josiahcarlson/5526230 to your computer and use it in GitHub Desktop.
Encode/decode sequences of integers to offer the ability to easily compare encoded sequences of integers as though they were decoded (which can be used to implement array comparisons in a database that doesn't offer native arrays).
'''
prefix_code.py
Copyright 2012-2013 Josiah Carlson
Released under the GNU LGPL v 2.1 license
This module offers the ability to encode/decode a sequence of integers into
strings that can then be used to compare sequences of integers (or paths on
trees) quickly. This module was originally intended to be used in a case-
preserving index in a relational database (where 'Z' comes before 'a', as is
the case with binary coallation).
Methods *very similar* to this were used to handle paginated tree-structured
comments for the YouTube Groups forums (which was used from August 2009 to
December 2010). My impotice to write this code came as a result of reading the
following blog post, an realizing that I'd solved the same problem a couple
years before:
http://justcramer.com/2012/04/08/using-arrays-as-materialized-paths-in-postgres/
I don't have a name for this type of coding, though it shares some concepts
of golomb codes (specifying the length of the sub-code, while at the same time
encoding information about how many bytes are used to encode the number.
There are a few other similar ideas that could (for example) use 1 bit of
every X grouped bits to offer magnitude scaling (which would be 1 bit of every
6 bits below), but those methods do not offer the same sort of comparison
consistency that this method offers.
'''
from itertools import chain
import string
# The choice and order of these values is such that each character is sorted
# with respect to other characters, and the relative order of integers is the
# same before and after conversion to a string.
to_char = '!.' + string.digits + string.uppercase + string.lowercase
to_int = dict((k,v) for v,k in enumerate(to_char))
cutoff = len(to_char) - 11
# 0 ... cutoff-1 -> single char
# with X digits, x > 1 you can represent values:
# cutoff + len(to_char)**(X-2) - 1 to cutoff + len(to_char)**(X-1) - 1
# So with a a cutoff of 64 - 11, you can represent 0 to 2**66 + cutoff - 1
def to_chars(value):
'''
Converts an integer to its text-encoded form.
'''
if value < cutoff:
return to_char[value]
value -= cutoff
out = []
while value:
value, rem = divmod(value, len(to_char))
out.append(to_char[rem])
# handle when value == cutoff
if not out:
out.append(to_char[value])
out.append(to_char[cutoff + len(out) - 1])
out.reverse()
return ''.join(out)
def to_val(chars):
'''
Converts characters from the provided string back into their integer
representation, and returns both the desired integer as well as the number
of bytes consumed from the character string (this function can accept a
string that is the result of a concatenation of results from to_chars() ).
'''
first = to_int[chars[0]]
if first < cutoff:
return first, 1
first -= cutoff - 1
dec = []
for ch in chars[1:1+first]:
dec.append(to_int[ch])
value = dec.pop() + cutoff
m = len(to_char)
while dec:
value += m * dec.pop()
m *= len(to_char)
return value, first + 1
def from_sequence(lst):
'''
Converts a sequence of integers into a string that represents those
integers.
'''
return ''.join(map(to_chars, lst))
def from_string(str):
'''
Converts a string that represents a sequence of integers back into a list
of integers.
'''
out = []
i = 0
while i < len(str):
this, used = to_val(str[i:])
out.append(this)
i += used
return out
def from_sequence_delta(lst):
'''
The same as from_sequence(), except that each subsequent integer has the
previous integer subtracted from it. This can reduce data storage
somewhat, depending on your data.
This method requires that all subsequent values in the sequence are at
least as large as those proceeding it. Which is to say the following are
good sequences:
[1, 2, 3]
[1, 1, 1, 2, 2, 2]
But the following are bad sequences:
[1, 2, 3, 2]
[1, 0]
'''
return from_sequence(this - prev for prev, this in zip(chain([0], lst), lst))
def from_string_delta(str):
'''
The reverse of from_sequence_delta().
'''
d = from_string(str)
for i in xrange(1, len(d)):
d[i] += d[i-1]
return d
def test():
import random
for i in xrange(0, 1024, 100):
x = [i]
y = from_sequence(x)
z = from_string(y)
assert x == z, (x, y, z)
print (x, y, z)
x = [2**random.randrange(10) + random.randrange(128) for i in xrange(10)]
y = from_sequence(x)
z = from_string(y)
assert x == z, (x, y, z)
print (x, y, z)
x.sort()
y = from_sequence_s(x)
z = from_string_s(y)
assert x == z, (x, y, z)
print (x, y, z)
x = [2**66 + cutoff - 1]
y = from_sequence(x)
z = from_string(y)
assert x == z, (x, y, z)
print (x, y, z)
for i in xrange(1000):
x = [[random.randrange(10000)] for i in xrange(100)]
y = map(from_sequence, x)
x.sort()
y.sort()
z = map(from_string, y)
assert x == z, (x, y, z)
print hex(from_string('zzzzzzzzzzzz')[0] - cutoff).count('f')
if __name__ == '__main__':
test()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment