Created
April 2, 2015 14:03
-
-
Save jogonba2/88dcd37fae56645d43d5 to your computer and use it in GitHub Desktop.
Posting lists compression.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
def get_gaps_list(posting_lists): return [posting_lists[0]]+[posting_lists[i]-posting_lists[i-1] for i in xrange(1,len(posting_lists))] | |
def vb_encode_number(number): | |
bytex = [] | |
while True: | |
bytex.insert(0,number%128); | |
if number<128: break | |
else: number /= 128 | |
bytex[len(bytex)-1] |= 0b10000000 | |
return bytex | |
def vb_encode(numbers): return [vb_encode_number(number) for number in numbers] | |
def vb_decode_number(number_list): | |
number,n = 0,0 | |
for x in xrange(len(number_list)): | |
if number_list[x]<128: n = 128 * n + number_list[x] | |
else: number = n = 128 * n + (number_list[x]-128) ; n=0 | |
return number | |
def vb_decode(numbers_encoded): return [vb_decode_number(number) for number in numbers_encoded] | |
if __name__ == "__main__": | |
posting_lists = [824,829,215406] | |
print "Posting lists: " , posting_lists | |
gap_list = get_gaps_list(posting_lists) | |
print "Gap list: " ,gap_list | |
numbers_encoded = vb_encode(gap_list) | |
print "Gaps encoded using less bytes for each number: ",numbers_encoded | |
numbers_decoded = vb_decode(numbers_encoded) | |
print "Gaps decoded: " , numbers_decoded |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment