Skip to content

Instantly share code, notes, and snippets.

@metula
Last active August 6, 2016 14:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save metula/9709539 to your computer and use it in GitHub Desktop.
Save metula/9709539 to your computer and use it in GitHub Desktop.
import math
import random
def str_to_ascii(text):
""" Gets rid of on ASCII characters in text"""
return ''.join(ch for ch in text if ord(ch)<128)
def maxmatch(T, p, w=2**12-1, max_length=2**5-1):
""" finds a maximum match of length k<=2**5-1 in a
w long window, T[p:p+k] with T[p-m:p-m+k].
Returns m (offset) and k (match length) """
assert isinstance(T, str)
n = len(T)
maxmatch = 0
offset = 0
for m in range(1, min(p+1, w)):
current_length = 0
for k in range(0, min(max_length, n - p)):
# at this point, T[p-m:p-m+k]==T[p:p+k]
if T[p-m+k] != T[p+k]:
break
else:
current_length = k + 1
if maxmatch < current_length:
maxmatch = current_length
offset = m
return offset, maxmatch
def lz77_compress(text, w=2**12-1, max_length=2**5-1, min_length=2):
"""LZ77 compression of an ASCII text. Produces
a list comprising of either ASCII character
or by a pair [m,k] where m is an offset and
k is a match (both are non negative integers)"""
result = []
n = len(text)
p = 0
while p < n:
if ord(text[p]) >= 128: continue
m, k = maxmatch(text, p, w, max_length)
if k < min_length:
result.append(text[p]) # a single char
p += 1
else:
result.append([m,k]) # two or more chars in match
p += k
return result # produces a list composed of chars and pairs
def lz77_decompress(compressed, w=2**12-1, max_length=2**5-1):
"""LZ77 decompression from intermediate format to ASCII text"""
result = []
n = len(compressed)
p = 0
while p < n:
if type(compressed[p]) == str: # char, as opposed to a pair
result.append(compressed[p])
p += 1
else:
m, k = compressed[p]
p += 1
for i in range(0, k):
# append k times to result;
result.append(result[-m])
# fixed offset m "to the left" as result itself grows
return "".join(result)
def inter_to_bin(lst, w=2**12-1, max_length=2**5-1):
""" converts intermediate format compressed list
to a string of bits"""
offset_width = math.ceil(math.log(w,2))
match_width = math.ceil(math.log(max_length,2))
result = []
for elem in lst:
if type(elem) == str:
result.append("0")
result.append('{:07b}'.format(ord(elem)))
elif type(elem) == list:
result.append("1")
m, k = elem
result.append('{num:0{width}b}'.format(num=m, width=offset_width))
result.append('{num:0{width}b}'.format(num=k, width=match_width))
return "".join(ch for ch in result)
def bin_to_inter(compressed, w=2**12-1, max_length=2**5-1):
""" converts a compressed string of bits
to intermediate compressed format """
offset_width = math.ceil(math.log(w,2))
match_width = math.ceil(math.log(max_length,2))
result = []
n = len(compressed)
p = 0
while p < n:
if compressed[p] == "0": # single ASCII char
p += 1
char = chr(int(compressed[p:p+7], 2))
result.append(char)
p += 7
elif compressed[p] == "1": # repeat of several chars
p += 1
m = int(compressed[p:p+offset_width], 2)
p += offset_width
k = int(compressed[p:p+match_width], 2)
p += match_width
result.append([m,k])
return result
def download(url):
import urllib.request
return urllib.request.urlopen(url).read()
def random_string(n):
return "".join(chr(random.randrange(128)) for i in range(n))
NYT = 'http://www.nytimes.com/'
LZ = 'http://en.wikipedia.org/wiki/LZ77_and_LZ78'
def process(text):
atext = str_to_ascii(text)
inter = lz77_compress(atext)
bint = inter_to_bin(inter)
inter2 = bin_to_inter(bint)
text2 = lz77_decompress(inter)
return inter, bint, inter2, text2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment