Last active
August 6, 2016 14:16
-
-
Save metula/9709539 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import random | |
def str_to_ascii(text): | |
""" Gets rid of on ASCII characters in text""" | |
return ''.join(ch for ch in text if ord(ch)<128) | |
def maxmatch(T, p, w=2**12-1, max_length=2**5-1): | |
""" finds a maximum match of length k<=2**5-1 in a | |
w long window, T[p:p+k] with T[p-m:p-m+k]. | |
Returns m (offset) and k (match length) """ | |
assert isinstance(T, str) | |
n = len(T) | |
maxmatch = 0 | |
offset = 0 | |
for m in range(1, min(p+1, w)): | |
current_length = 0 | |
for k in range(0, min(max_length, n - p)): | |
# at this point, T[p-m:p-m+k]==T[p:p+k] | |
if T[p-m+k] != T[p+k]: | |
break | |
else: | |
current_length = k + 1 | |
if maxmatch < current_length: | |
maxmatch = current_length | |
offset = m | |
return offset, maxmatch | |
def lz77_compress(text, w=2**12-1, max_length=2**5-1, min_length=2): | |
"""LZ77 compression of an ASCII text. Produces | |
a list comprising of either ASCII character | |
or by a pair [m,k] where m is an offset and | |
k is a match (both are non negative integers)""" | |
result = [] | |
n = len(text) | |
p = 0 | |
while p < n: | |
if ord(text[p]) >= 128: continue | |
m, k = maxmatch(text, p, w, max_length) | |
if k < min_length: | |
result.append(text[p]) # a single char | |
p += 1 | |
else: | |
result.append([m,k]) # two or more chars in match | |
p += k | |
return result # produces a list composed of chars and pairs | |
def lz77_decompress(compressed, w=2**12-1, max_length=2**5-1): | |
"""LZ77 decompression from intermediate format to ASCII text""" | |
result = [] | |
n = len(compressed) | |
p = 0 | |
while p < n: | |
if type(compressed[p]) == str: # char, as opposed to a pair | |
result.append(compressed[p]) | |
p += 1 | |
else: | |
m, k = compressed[p] | |
p += 1 | |
for i in range(0, k): | |
# append k times to result; | |
result.append(result[-m]) | |
# fixed offset m "to the left" as result itself grows | |
return "".join(result) | |
def inter_to_bin(lst, w=2**12-1, max_length=2**5-1): | |
""" converts intermediate format compressed list | |
to a string of bits""" | |
offset_width = math.ceil(math.log(w,2)) | |
match_width = math.ceil(math.log(max_length,2)) | |
result = [] | |
for elem in lst: | |
if type(elem) == str: | |
result.append("0") | |
result.append('{:07b}'.format(ord(elem))) | |
elif type(elem) == list: | |
result.append("1") | |
m, k = elem | |
result.append('{num:0{width}b}'.format(num=m, width=offset_width)) | |
result.append('{num:0{width}b}'.format(num=k, width=match_width)) | |
return "".join(ch for ch in result) | |
def bin_to_inter(compressed, w=2**12-1, max_length=2**5-1): | |
""" converts a compressed string of bits | |
to intermediate compressed format """ | |
offset_width = math.ceil(math.log(w,2)) | |
match_width = math.ceil(math.log(max_length,2)) | |
result = [] | |
n = len(compressed) | |
p = 0 | |
while p < n: | |
if compressed[p] == "0": # single ASCII char | |
p += 1 | |
char = chr(int(compressed[p:p+7], 2)) | |
result.append(char) | |
p += 7 | |
elif compressed[p] == "1": # repeat of several chars | |
p += 1 | |
m = int(compressed[p:p+offset_width], 2) | |
p += offset_width | |
k = int(compressed[p:p+match_width], 2) | |
p += match_width | |
result.append([m,k]) | |
return result | |
def download(url): | |
import urllib.request | |
return urllib.request.urlopen(url).read() | |
def random_string(n): | |
return "".join(chr(random.randrange(128)) for i in range(n)) | |
NYT = 'http://www.nytimes.com/' | |
LZ = 'http://en.wikipedia.org/wiki/LZ77_and_LZ78' | |
def process(text): | |
atext = str_to_ascii(text) | |
inter = lz77_compress(atext) | |
bint = inter_to_bin(inter) | |
inter2 = bin_to_inter(bint) | |
text2 = lz77_decompress(inter) | |
return inter, bint, inter2, text2 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment