metula/LZ.py

## LZ.py
import math
import random

def str_to_ascii(text):
  """ Gets rid of on ASCII characters in text"""
  return ''.join(ch for ch in text if ord(ch)<128)

def maxmatch(T, p, w=2**12-1, max_length=2**5-1):
  """ finds a maximum match of length k<=2**5-1 in a
  w long window, T[p:p+k] with T[p-m:p-m+k].
  Returns m (offset) and k (match length) """
  assert isinstance(T, str)
  n = len(T)
  maxmatch = 0
  offset = 0
  for m in range(1, min(p+1, w)):
    current_length = 0
    for k in range(0, min(max_length, n - p)):
      # at this point, T[p-m:p-m+k]==T[p:p+k]
      if T[p-m+k] != T[p+k]:
        break
      else:
        current_length = k + 1
    if maxmatch < current_length:
      maxmatch = current_length
      offset = m
  return offset, maxmatch

def lz77_compress(text, w=2**12-1, max_length=2**5-1, min_length=2):
  """LZ77 compression of an ASCII text. Produces
  a list comprising of either ASCII character
  or by a pair [m,k] where m is an offset and
  k is a match (both are non negative integers)"""
  result = []
  n = len(text)
  p = 0
  while p < n:
    if ord(text[p]) >= 128: continue
    m, k = maxmatch(text, p, w, max_length)
    if k < min_length:
      result.append(text[p])  #  a single char
      p += 1
    else:
      result.append([m,k])   # two or more chars in match
      p += k
  return result  # produces a list composed of chars and pairs

def lz77_decompress(compressed, w=2**12-1, max_length=2**5-1):
   """LZ77 decompression from intermediate format to ASCII text"""
   result = []
   n = len(compressed)
   p = 0
   while p < n:
      if type(compressed[p]) == str:  # char, as opposed to a pair
         result.append(compressed[p])
         p += 1
      else:
         m, k = compressed[p]
         p += 1
         for i in range(0, k):
            # append k times to result;
            result.append(result[-m])
            # fixed offset m "to the left" as result itself grows
   return "".join(result)


def inter_to_bin(lst, w=2**12-1, max_length=2**5-1):
   """ converts intermediate format compressed list
       to a string of bits"""
   offset_width = math.ceil(math.log(w,2))
   match_width = math.ceil(math.log(max_length,2))
   result = []
   for elem in lst:
      if type(elem) == str:
         result.append("0")
         result.append('{:07b}'.format(ord(elem)))
      elif type(elem) == list:
         result.append("1")
         m, k = elem
         result.append('{num:0{width}b}'.format(num=m, width=offset_width))
         result.append('{num:0{width}b}'.format(num=k, width=match_width))

   return "".join(ch for ch in result)

def bin_to_inter(compressed, w=2**12-1, max_length=2**5-1):
   """ converts a compressed string of bits
       to intermediate compressed format """
   offset_width = math.ceil(math.log(w,2))
   match_width = math.ceil(math.log(max_length,2))
   result = []
   n = len(compressed)
   p = 0
   while p < n:
      if compressed[p] == "0":  # single ASCII char
         p += 1
         char = chr(int(compressed[p:p+7], 2))
         result.append(char)
         p += 7
      elif compressed[p] == "1":  # repeat of several chars
         p += 1
         m = int(compressed[p:p+offset_width], 2)
         p += offset_width
         k = int(compressed[p:p+match_width], 2)
         p += match_width
         result.append([m,k])
   return result

def download(url):
  import urllib.request
  return urllib.request.urlopen(url).read()

def random_string(n):
  return "".join(chr(random.randrange(128)) for i in range(n))

NYT = 'http://www.nytimes.com/'
LZ = 'http://en.wikipedia.org/wiki/LZ77_and_LZ78'

def process(text):
  atext = str_to_ascii(text)
  inter = lz77_compress(atext)
  bint = inter_to_bin(inter)
  inter2 = bin_to_inter(bint)
  text2 = lz77_decompress(inter)
  return inter, bint, inter2, text2
	import math
	import random

	def str_to_ascii(text):
	""" Gets rid of on ASCII characters in text"""
	return ''.join(ch for ch in text if ord(ch)<128)

	def maxmatch(T, p, w=212-1, max_length=25-1):
	""" finds a maximum match of length k<=2**5-1 in a
	w long window, T[p:p+k] with T[p-m:p-m+k].
	Returns m (offset) and k (match length) """
	assert isinstance(T, str)
	n = len(T)
	maxmatch = 0
	offset = 0
	for m in range(1, min(p+1, w)):
	current_length = 0
	for k in range(0, min(max_length, n - p)):
	# at this point, T[p-m:p-m+k]==T[p:p+k]
	if T[p-m+k] != T[p+k]:
	break
	else:
	current_length = k + 1
	if maxmatch < current_length:
	maxmatch = current_length
	offset = m
	return offset, maxmatch

	def lz77_compress(text, w=212-1, max_length=25-1, min_length=2):
	"""LZ77 compression of an ASCII text. Produces
	a list comprising of either ASCII character
	or by a pair [m,k] where m is an offset and
	k is a match (both are non negative integers)"""
	result = []
	n = len(text)
	p = 0
	while p < n:
	if ord(text[p]) >= 128: continue
	m, k = maxmatch(text, p, w, max_length)
	if k < min_length:
	result.append(text[p]) # a single char
	p += 1
	else:
	result.append([m,k]) # two or more chars in match
	p += k
	return result # produces a list composed of chars and pairs

	def lz77_decompress(compressed, w=212-1, max_length=25-1):
	"""LZ77 decompression from intermediate format to ASCII text"""
	result = []
	n = len(compressed)
	p = 0
	while p < n:
	if type(compressed[p]) == str: # char, as opposed to a pair
	result.append(compressed[p])
	p += 1
	else:
	m, k = compressed[p]
	p += 1
	for i in range(0, k):
	# append k times to result;
	result.append(result[-m])
	# fixed offset m "to the left" as result itself grows
	return "".join(result)


	def inter_to_bin(lst, w=212-1, max_length=25-1):
	""" converts intermediate format compressed list
	to a string of bits"""
	offset_width = math.ceil(math.log(w,2))
	match_width = math.ceil(math.log(max_length,2))
	result = []
	for elem in lst:
	if type(elem) == str:
	result.append("0")
	result.append('{:07b}'.format(ord(elem)))
	elif type(elem) == list:
	result.append("1")
	m, k = elem
	result.append('{num:0{width}b}'.format(num=m, width=offset_width))
	result.append('{num:0{width}b}'.format(num=k, width=match_width))

	return "".join(ch for ch in result)

	def bin_to_inter(compressed, w=212-1, max_length=25-1):
	""" converts a compressed string of bits
	to intermediate compressed format """
	offset_width = math.ceil(math.log(w,2))
	match_width = math.ceil(math.log(max_length,2))
	result = []
	n = len(compressed)
	p = 0
	while p < n:
	if compressed[p] == "0": # single ASCII char
	p += 1
	char = chr(int(compressed[p:p+7], 2))
	result.append(char)
	p += 7
	elif compressed[p] == "1": # repeat of several chars
	p += 1
	m = int(compressed[p:p+offset_width], 2)
	p += offset_width
	k = int(compressed[p:p+match_width], 2)
	p += match_width
	result.append([m,k])
	return result

	def download(url):
	import urllib.request
	return urllib.request.urlopen(url).read()

	def random_string(n):
	return "".join(chr(random.randrange(128)) for i in range(n))

	NYT = 'http://www.nytimes.com/'
	LZ = 'http://en.wikipedia.org/wiki/LZ77_and_LZ78'

	def process(text):
	atext = str_to_ascii(text)
	inter = lz77_compress(atext)
	bint = inter_to_bin(inter)
	inter2 = bin_to_inter(bint)
	text2 = lz77_decompress(inter)
	return inter, bint, inter2, text2