fbwright/novel_zip.py

## novel_zip.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, division
import sys, time
if sys.version_info.major < 3:
	input = raw_input

def _split_token(token):
	"""Split *token* into (opening punctuation, core, closing punctuation).

	Leading characters found in PRE_PUNCTUATION are peeled off first, then
	trailing characters found in POST_PUNCTUATION are peeled off what is left.
	"""
	start, end = 0, len(token)
	while start < end and token[start] in PRE_PUNCTUATION:
		start += 1
	while end > start and token[end - 1] in POST_PUNCTUATION:
		end -= 1
	return token[:start], token[start:end], token[end:]


def _encode_index(index):
	"""Return the 1-, 2- or 3-byte chunk referring to dictionary slot *index*.

	Bits 5-6 of the first byte give the width (0 = one byte, 1 = two bytes,
	2 = three bytes); the remaining bits hold the index, offset by 0x20 for
	the two-byte form and by 0x2020 for the three-byte form.

	Raises:
		ValueError: if *index* does not fit in the three-byte form.
	"""
	if index < 0x20:
		return bytes([index])
	if index < 0x2020:
		index -= 0x20
		return bytes([0x20 | (index >> 8), index & 0xFF])
	index -= 0x2020
	if index >= 0x200000:
		# A larger first byte would collide with the 0x60+ punctuation codes.
		raise ValueError(
			"Too many distinct words to encode (index %d)." % (index + 0x2020))
	return bytes([0x40 | (index >> 16), (index >> 8) & 0xFF, index & 0xFF])


def compress(data):
	"""Turn text into a word dictionary plus a list of encoded chunks.

	Each whitespace-separated token is stripped of surrounding punctuation and
	split on '-'; every non-empty part is looked up (lowercased) in the
	dictionary. The emitted chunks are:

	* 0x00-0x5F  dictionary index (see _encode_index)
	* 0x60 + i   opening punctuation PRE_PUNCTUATION[i]
	* 0x70 + i   closing punctuation POST_PUNCTUATION[i]
	* 0x81       end of line          0x80  end of text
	* 0x82       next word is Capitalized
	* 0x83       next word is UPPERCASE
	* 0x84       join the next word with '-' instead of a space

	Only all-lowercase, Capitalized and UPPERCASE spellings are preserved;
	any other mixed case is stored lowercased. Runs of whitespace collapse
	because tokens come from str.split().

	Args:
		data: the text to compress (str).

	Returns:
		(words, chunks): *words* is a tuple of lowercase words ordered by
		descending len(word) * occurrences; *chunks* is a list of bytes
		objects, terminated by b"\\x80".

	Raises:
		ValueError: if there are too many distinct words to index.
	"""
	# Pass 1: count how often each lowercased word occurs.
	freq = {}
	for line in data.splitlines():
		for token in line.split():
			core = _split_token(token)[1]
			for part in core.split('-'):
				if part:
					key = part.lower()
					freq[key] = freq.get(key, 0) + 1

	ranked = sorted(freq.items(), key=lambda item: -len(item[0]) * item[1])
	words = tuple(word for word, _ in ranked)
	# O(1) lookups instead of tuple.index() for every word of the text.
	slot_of = {word: slot for slot, word in enumerate(words)}

	# Pass 2: emit the chunk stream.
	chunks = []
	for line in data.splitlines():
		for token in line.split():
			opening, core, closing = _split_token(token)
			for char in opening:
				chunks.append(bytes([0x60 + PRE_PUNCTUATION.index(char)]))

			parts = core.split('-')
			last = len(parts) - 1
			for position, part in enumerate(parts):
				if part:
					lowered = part.lower()
					# A case marker is only needed when the spelling differs
					# from the dictionary entry; empty parts never get one.
					if part != lowered:
						if part == part.capitalize():
							chunks.append(b"\x82")
						elif part == part.upper():
							chunks.append(b"\x83")
					chunks.append(_encode_index(slot_of[lowered]))
				# The hyphen marker goes between parts only; emitting it after
				# the final part would glue the following word on with '-'.
				if position < last:
					chunks.append(b"\x84")

			for char in closing:
				chunks.append(bytes([0x70 + POST_PUNCTUATION.index(char)]))
		chunks.append(b"\x81")
	chunks.append(b"\x80")
	return words, chunks

def binary_out(dict, chunks):
	"""Serialise a dictionary and chunk list into the ZTXT-B container.

	The result is the 8-byte file signature followed by a "DICT" section and
	a "TEXT" section. Each section is a 4-byte tag, a 4-byte big-endian
	payload length, and then the payload.

	Args:
		dict: iterable of words (str); stored UTF-8 encoded, NUL-separated,
			with one trailing NUL.
		chunks: iterable of bytes objects, concatenated into the TEXT payload.

	Returns:
		The complete file contents as bytes.
	"""
	def section(tag, payload):
		# Only the low 32 bits of the length are stored in the size field.
		length = len(payload) & 0xFFFFFFFF
		return tag + length.to_bytes(4, "big") + payload

	encoded_words = [word.encode("utf-8") for word in dict]
	dictionary = b"\x00".join(encoded_words) + b"\x00"
	text = b"".join(chunks)
	return b"ZTXT-B\r\n" + section(b"DICT", dictionary) + section(b"TEXT", text)

def parse_chunks(data):
	"""Split a TEXT payload into a list of integer chunks.

	Bits 5 and up of each leading byte decide how many bytes follow: a value
	of 1 means one continuation byte, 2 means two. Every other byte
	(including all bytes >= 0x80) is a chunk on its own. Multi-byte chunks
	are returned as a single big-endian integer that still contains the
	width bits of the first byte.

	Args:
		data: indexable sequence of byte values (e.g. bytes).

	Returns:
		List of ints, one per chunk.

	Raises:
		IndexError: if the data ends in the middle of a multi-byte chunk.
	"""
	chunks = []
	position, end = 0, len(data)
	while position < end:
		value = data[position]
		position += 1
		trailing = value >> 5
		if trailing in (1, 2):
			# Fold the continuation bytes onto the leading byte.
			for _ in range(trailing):
				value = (value << 8) | data[position]
				position += 1
		chunks.append(value)
	return chunks

def binary_in(data):
	"""Parse a ZTXT-B container into its dictionary and chunk list.

	After the 8-byte signature the file is a sequence of sections, each made
	of a 4-byte tag, a 4-byte big-endian payload length and the payload.
	"DICT" holds NUL-terminated UTF-8 words, "TEXT" holds the chunk stream
	(decoded with parse_chunks). Sections with any other tag are skipped.
	If a tag occurs more than once, the last occurrence wins.

	Args:
		data: the raw file contents (bytes).

	Returns:
		(words, chunks): list of str and list of int; either is empty when
		its section is absent.

	Raises:
		ValueError: if the signature is wrong or a section header is cut short.
	"""
	magic = b"ZTXT-B\r\n"
	if data[:len(magic)] != magic:
		raise ValueError("Not a ZTXT-B file: bad or missing header.")
	body = data[len(magic):]

	words, chunks = [], []
	offset = 0
	while offset < len(body):
		if len(body) - offset < 8:
			raise ValueError(
				"Truncated section header at offset %d." % (offset + len(magic)))
		tag = body[offset:offset + 4]
		size = int.from_bytes(body[offset + 4:offset + 8], "big")
		payload = body[offset + 8:offset + 8 + size]
		offset += 8 + size
		if tag == b"DICT":
			# Every word is NUL-terminated, so the final split item is empty.
			words = [entry.decode("utf-8")
				for entry in payload.split(b"\x00")[:-1]]
		elif tag == b"TEXT":
			chunks = parse_chunks(payload)
	return words, chunks

# Case to apply to the next dictionary word.
LOWER, CAPITALIZE, UPPER = 0, 1, 2
# Punctuation stripped from the front / back of a token; a character's
# position in the string is its code (added to 0x60 / 0x70 respectively).
PRE_PUNCTUATION = "([{'\"<"
POST_PUNCTUATION = ",.?!:;)]}'\">"
def decompress(dict, chunks):
	"""Rebuild text from a word dictionary and a list of integer chunks.

	Chunk meanings:

	* 0x80 end of text, 0x81 newline, 0x82 capitalize the next word,
	  0x83 uppercase the next word, 0x84 join the next word with '-';
	  other values in 0x85-0xFF are ignored.
	* Chunks whose width bits are 0, 1 or 2 are dictionary indices
	  (offset by 0, 0x20 and 0x2020 respectively).
	* Chunks whose width bits are 3 are punctuation: low five bits below 16
	  select PRE_PUNCTUATION, 16 and above select POST_PUNCTUATION.

	Words are separated by a single space unless a newline, hyphen marker or
	opening punctuation says otherwise. Opening punctuation is written as
	soon as it is read, so it is kept even when no word follows it.

	Args:
		dict: sequence of words, indexable by position (the name shadows the
			builtin but is kept for keyword-argument compatibility).
		chunks: iterable of ints as produced by parse_chunks.

	Returns:
		The decompressed text (str).

	Raises:
		IndexError: if a chunk refers to a missing word or punctuation mark.
	"""
	pieces = []
	case = LOWER
	delimiter = ""
	for chunk in chunks:
		# What precedes the *following* word unless this chunk overrides it.
		next_delimiter = " "
		if 0x80 <= chunk < 0x100:
			code = chunk - 0x80
			if code == 0:  # end of text
				break
			elif code == 1:  # newline
				pieces.append("\n")
				next_delimiter = ""
			elif code == 2:  # capitalize next word; keep pending delimiter
				case = CAPITALIZE
				next_delimiter = delimiter
			elif code == 3:  # uppercase next word; keep pending delimiter
				case = UPPER
				next_delimiter = delimiter
			elif code == 4:  # hyphen joiner
				next_delimiter = "-"
		else:
			# The width bits sit just above the 5 low bits of the first byte.
			if chunk >= 0x10000:
				width = (chunk >> 21) & 0x3
			elif chunk >= 0x100:
				width = (chunk >> 13) & 0x3
			else:
				width = (chunk >> 5) & 0x3

			if width == 3:
				slot = chunk & 0x1f
				if slot >= 16:
					pieces.append(POST_PUNCTUATION[slot - 16])
				else:
					# Emit immediately: deferring it as a delimiter would drop
					# it whenever something other than a word comes next.
					pieces.append(delimiter)
					pieces.append(PRE_PUNCTUATION[slot])
					next_delimiter = ""
			else:
				if width == 0:
					index = chunk & 0x1f
				elif width == 1:
					index = (chunk & 0x1fff) + 0x20
				else:
					index = (chunk & 0x1fffff) + 0x2020
				word = dict[index]
				if case == CAPITALIZE:
					word = word.capitalize()
				elif case == UPPER:
					word = word.upper()
				pieces.append(delimiter)
				pieces.append(word)
				case = LOWER
		delimiter = next_delimiter
	return "".join(pieces)

def do_comprime(file_in, file_out=None):
	"""Compress the text file *file_in* and write the result to *file_out*.

	Args:
		file_in: path of the text file to read.
		file_out: destination path; defaults to *file_in* + ".ztxt".

	Prints a one-line summary with the elapsed time, both sizes and the
	compressed size as a percentage of the original. The original size is
	the number of characters read in text mode, not bytes on disk.
	"""
	start = time.time()
	with open(file_in, "r") as source:
		data = source.read()
	size = len(data)
	words, chunks = compress(data)
	data = binary_out(words, chunks)
	compressed_size = len(data)
	if file_out is None:
		file_out = file_in + ".ztxt"
	with open(file_out, "wb") as target:
		target.write(data)
	elapsed = time.time() - start
	# Percentage, to match the '%' in the message; 0 for an empty input
	# instead of dividing by zero.
	percent = 100.0 * compressed_size / size if size else 0.0
	print("Compressed file '%s' [%.2f s; %.2f kiB] - '%s' [%.2f kiB] (compression %.4f%%)"%(file_in, elapsed, size/1024.0, file_out, compressed_size/1024.0, percent))

def do_decomprime(file_in, file_out=None):
	"""Decompress the ZTXT file *file_in* and write the text to *file_out*.

	Args:
		file_in: path of the compressed file to read.
		file_out: destination path; defaults to *file_in* + ".txt".

	Prints a one-line summary with the elapsed time, both sizes and the
	compressed size as a percentage of the decompressed text. The text size
	is counted in characters, not bytes on disk.
	"""
	start = time.time()
	with open(file_in, "rb") as source:
		data = source.read()
	compressed_size = len(data)
	words, chunks = binary_in(data)
	data = decompress(words, chunks)
	size = len(data)
	if file_out is None:
		file_out = file_in + ".txt"
	with open(file_out, "w") as target:
		target.write(data)
	elapsed = time.time() - start
	# Percentage, to match the '%' in the message; 0 for empty output
	# instead of dividing by zero.
	percent = 100.0 * compressed_size / size if size else 0.0
	print("Decompressed file '%s' [%.2f s; %.2f kiB] - '%s' [%.2f kiB] (compression %.4f%%)"%(file_in, elapsed, compressed_size/1024.0, file_out, size/1024.0, percent))

if __name__ == "__main__":
	# Command-line entry point: novel_zip {d|c} FILE_IN [FILE_OUT].
	# Without both a command and an input path, cmd stays empty and the
	# usage text is shown.
	if len(sys.argv) < 3:
		cmd = ""
	else:
		cmd, file_in = sys.argv[1:3]
		file_out = None if len(sys.argv) == 3 else sys.argv[3]

	handlers = {"c": do_comprime, "d": do_decomprime}
	if cmd in handlers:
		handlers[cmd](file_in, file_out)
	else:
		print(
			"Usage\n"
			"novel_zip {d|c} FILE_IN [FILE_OUT]\n"
			"\n"
			"  d    Decompress the file.\n"
			"  c    Compress the file.\n"
		)
	#!/usr/bin/env python
	# -- coding: utf-8 --
	from __future__ import print_function, division
	import sys, time
	if sys.version_info.major < 3:
	input = raw_input

	def _split_token(token):
		"""Split *token* into (opening punctuation, core, closing punctuation).

		Leading characters found in PRE_PUNCTUATION are peeled off first, then
		trailing characters found in POST_PUNCTUATION are peeled off what is left.
		"""
		start, end = 0, len(token)
		while start < end and token[start] in PRE_PUNCTUATION:
			start += 1
		while end > start and token[end - 1] in POST_PUNCTUATION:
			end -= 1
		return token[:start], token[start:end], token[end:]


	def _encode_index(index):
		"""Return the 1-, 2- or 3-byte chunk referring to dictionary slot *index*.

		Bits 5-6 of the first byte give the width (0 = one byte, 1 = two bytes,
		2 = three bytes); the remaining bits hold the index, offset by 0x20 for
		the two-byte form and by 0x2020 for the three-byte form.

		Raises:
			ValueError: if *index* does not fit in the three-byte form.
		"""
		if index < 0x20:
			return bytes([index])
		if index < 0x2020:
			index -= 0x20
			return bytes([0x20 | (index >> 8), index & 0xFF])
		index -= 0x2020
		if index >= 0x200000:
			# A larger first byte would collide with the 0x60+ punctuation codes.
			raise ValueError(
				"Too many distinct words to encode (index %d)." % (index + 0x2020))
		return bytes([0x40 | (index >> 16), (index >> 8) & 0xFF, index & 0xFF])


	def compress(data):
		"""Turn text into a word dictionary plus a list of encoded chunks.

		Each whitespace-separated token is stripped of surrounding punctuation
		and split on '-'; every non-empty part is looked up (lowercased) in the
		dictionary. The emitted chunks are:

		* 0x00-0x5F  dictionary index (see _encode_index)
		* 0x60 + i   opening punctuation PRE_PUNCTUATION[i]
		* 0x70 + i   closing punctuation POST_PUNCTUATION[i]
		* 0x81       end of line          0x80  end of text
		* 0x82       next word is Capitalized
		* 0x83       next word is UPPERCASE
		* 0x84       join the next word with '-' instead of a space

		Only all-lowercase, Capitalized and UPPERCASE spellings are preserved;
		any other mixed case is stored lowercased. Runs of whitespace collapse
		because tokens come from str.split().

		Args:
			data: the text to compress (str).

		Returns:
			(words, chunks): *words* is a tuple of lowercase words ordered by
			descending len(word) * occurrences; *chunks* is a list of bytes
			objects, terminated by b"\\x80".

		Raises:
			ValueError: if there are too many distinct words to index.
		"""
		# Pass 1: count how often each lowercased word occurs.
		freq = {}
		for line in data.splitlines():
			for token in line.split():
				core = _split_token(token)[1]
				for part in core.split('-'):
					if part:
						key = part.lower()
						freq[key] = freq.get(key, 0) + 1

		ranked = sorted(freq.items(), key=lambda item: -len(item[0]) * item[1])
		words = tuple(word for word, _ in ranked)
		# O(1) lookups instead of tuple.index() for every word of the text.
		slot_of = {word: slot for slot, word in enumerate(words)}

		# Pass 2: emit the chunk stream.
		chunks = []
		for line in data.splitlines():
			for token in line.split():
				opening, core, closing = _split_token(token)
				for char in opening:
					chunks.append(bytes([0x60 + PRE_PUNCTUATION.index(char)]))

				parts = core.split('-')
				last = len(parts) - 1
				for position, part in enumerate(parts):
					if part:
						lowered = part.lower()
						# A case marker is only needed when the spelling differs
						# from the dictionary entry; empty parts never get one.
						if part != lowered:
							if part == part.capitalize():
								chunks.append(b"\x82")
							elif part == part.upper():
								chunks.append(b"\x83")
						chunks.append(_encode_index(slot_of[lowered]))
					# The hyphen marker goes between parts only; emitting it
					# after the final part would glue the following word on
					# with '-'.
					if position < last:
						chunks.append(b"\x84")

				for char in closing:
					chunks.append(bytes([0x70 + POST_PUNCTUATION.index(char)]))
			chunks.append(b"\x81")
		chunks.append(b"\x80")
		return words, chunks

	def binary_out(dict, chunks):
		"""Serialise a dictionary and chunk list into the ZTXT-B container.

		The result is the 8-byte file signature followed by a "DICT" section
		and a "TEXT" section. Each section is a 4-byte tag, a 4-byte big-endian
		payload length, and then the payload.

		Args:
			dict: iterable of words (str); stored UTF-8 encoded, NUL-separated,
				with one trailing NUL.
			chunks: iterable of bytes objects, concatenated into the TEXT payload.

		Returns:
			The complete file contents as bytes.
		"""
		def section(tag, payload):
			# Only the low 32 bits of the length are stored in the size field.
			length = len(payload) & 0xFFFFFFFF
			return tag + length.to_bytes(4, "big") + payload

		encoded_words = [word.encode("utf-8") for word in dict]
		dictionary = b"\x00".join(encoded_words) + b"\x00"
		text = b"".join(chunks)
		return b"ZTXT-B\r\n" + section(b"DICT", dictionary) + section(b"TEXT", text)

	def parse_chunks(data):
		"""Split a TEXT payload into a list of integer chunks.

		Bits 5 and up of each leading byte decide how many bytes follow: a
		value of 1 means one continuation byte, 2 means two. Every other byte
		(including all bytes >= 0x80) is a chunk on its own. Multi-byte chunks
		are returned as a single big-endian integer that still contains the
		width bits of the first byte.

		Args:
			data: indexable sequence of byte values (e.g. bytes).

		Returns:
			List of ints, one per chunk.

		Raises:
			IndexError: if the data ends in the middle of a multi-byte chunk.
		"""
		chunks = []
		position, end = 0, len(data)
		while position < end:
			value = data[position]
			position += 1
			trailing = value >> 5
			if trailing in (1, 2):
				# Fold the continuation bytes onto the leading byte.
				for _ in range(trailing):
					value = (value << 8) | data[position]
					position += 1
			chunks.append(value)
		return chunks

	def binary_in(data):
		"""Parse a ZTXT-B container into its dictionary and chunk list.

		After the 8-byte signature the file is a sequence of sections, each
		made of a 4-byte tag, a 4-byte big-endian payload length and the
		payload. "DICT" holds NUL-terminated UTF-8 words, "TEXT" holds the
		chunk stream (decoded with parse_chunks). Sections with any other tag
		are skipped. If a tag occurs more than once, the last occurrence wins.

		Args:
			data: the raw file contents (bytes).

		Returns:
			(words, chunks): list of str and list of int; either is empty when
			its section is absent.

		Raises:
			ValueError: if the signature is wrong or a section header is cut
				short.
		"""
		magic = b"ZTXT-B\r\n"
		if data[:len(magic)] != magic:
			raise ValueError("Not a ZTXT-B file: bad or missing header.")
		body = data[len(magic):]

		words, chunks = [], []
		offset = 0
		while offset < len(body):
			if len(body) - offset < 8:
				raise ValueError(
					"Truncated section header at offset %d." % (offset + len(magic)))
			tag = body[offset:offset + 4]
			size = int.from_bytes(body[offset + 4:offset + 8], "big")
			payload = body[offset + 8:offset + 8 + size]
			offset += 8 + size
			if tag == b"DICT":
				# Every word is NUL-terminated, so the final split item is empty.
				words = [entry.decode("utf-8")
					for entry in payload.split(b"\x00")[:-1]]
			elif tag == b"TEXT":
				chunks = parse_chunks(payload)
		return words, chunks

	# Case to apply to the next dictionary word.
	LOWER, CAPITALIZE, UPPER = 0, 1, 2
	# Punctuation stripped from the front / back of a token; a character's
	# position in the string is its code (added to 0x60 / 0x70 respectively).
	PRE_PUNCTUATION = "([{'\"<"
	POST_PUNCTUATION = ",.?!:;)]}'\">"
	def decompress(dict, chunks):
		"""Rebuild text from a word dictionary and a list of integer chunks.

		Chunk meanings:

		* 0x80 end of text, 0x81 newline, 0x82 capitalize the next word,
		  0x83 uppercase the next word, 0x84 join the next word with '-';
		  other values in 0x85-0xFF are ignored.
		* Chunks whose width bits are 0, 1 or 2 are dictionary indices
		  (offset by 0, 0x20 and 0x2020 respectively).
		* Chunks whose width bits are 3 are punctuation: low five bits below
		  16 select PRE_PUNCTUATION, 16 and above select POST_PUNCTUATION.

		Words are separated by a single space unless a newline, hyphen marker
		or opening punctuation says otherwise. Opening punctuation is written
		as soon as it is read, so it is kept even when no word follows it.

		Args:
			dict: sequence of words, indexable by position (the name shadows
				the builtin but is kept for keyword-argument compatibility).
			chunks: iterable of ints as produced by parse_chunks.

		Returns:
			The decompressed text (str).

		Raises:
			IndexError: if a chunk refers to a missing word or punctuation mark.
		"""
		pieces = []
		case = LOWER
		delimiter = ""
		for chunk in chunks:
			# What precedes the *following* word unless this chunk overrides it.
			next_delimiter = " "
			if 0x80 <= chunk < 0x100:
				code = chunk - 0x80
				if code == 0:  # end of text
					break
				elif code == 1:  # newline
					pieces.append("\n")
					next_delimiter = ""
				elif code == 2:  # capitalize next word; keep pending delimiter
					case = CAPITALIZE
					next_delimiter = delimiter
				elif code == 3:  # uppercase next word; keep pending delimiter
					case = UPPER
					next_delimiter = delimiter
				elif code == 4:  # hyphen joiner
					next_delimiter = "-"
			else:
				# The width bits sit just above the 5 low bits of the first byte.
				if chunk >= 0x10000:
					width = (chunk >> 21) & 0x3
				elif chunk >= 0x100:
					width = (chunk >> 13) & 0x3
				else:
					width = (chunk >> 5) & 0x3

				if width == 3:
					slot = chunk & 0x1f
					if slot >= 16:
						pieces.append(POST_PUNCTUATION[slot - 16])
					else:
						# Emit immediately: deferring it as a delimiter would
						# drop it whenever something other than a word comes next.
						pieces.append(delimiter)
						pieces.append(PRE_PUNCTUATION[slot])
						next_delimiter = ""
				else:
					if width == 0:
						index = chunk & 0x1f
					elif width == 1:
						index = (chunk & 0x1fff) + 0x20
					else:
						index = (chunk & 0x1fffff) + 0x2020
					word = dict[index]
					if case == CAPITALIZE:
						word = word.capitalize()
					elif case == UPPER:
						word = word.upper()
					pieces.append(delimiter)
					pieces.append(word)
					case = LOWER
			delimiter = next_delimiter
		return "".join(pieces)

	def do_comprime(file_in, file_out=None):
		"""Compress the text file *file_in* and write the result to *file_out*.

		Args:
			file_in: path of the text file to read.
			file_out: destination path; defaults to *file_in* + ".ztxt".

		Prints a one-line summary with the elapsed time, both sizes and the
		compressed size as a percentage of the original. The original size is
		the number of characters read in text mode, not bytes on disk.
		"""
		start = time.time()
		with open(file_in, "r") as source:
			data = source.read()
		size = len(data)
		words, chunks = compress(data)
		data = binary_out(words, chunks)
		compressed_size = len(data)
		if file_out is None:
			file_out = file_in + ".ztxt"
		with open(file_out, "wb") as target:
			target.write(data)
		elapsed = time.time() - start
		# Percentage, to match the '%' in the message; 0 for an empty input
		# instead of dividing by zero.
		percent = 100.0 * compressed_size / size if size else 0.0
		print("Compressed file '%s' [%.2f s; %.2f kiB] - '%s' [%.2f kiB] (compression %.4f%%)"%(file_in, elapsed, size/1024.0, file_out, compressed_size/1024.0, percent))

	def do_decomprime(file_in, file_out=None):
		"""Decompress the ZTXT file *file_in* and write the text to *file_out*.

		Args:
			file_in: path of the compressed file to read.
			file_out: destination path; defaults to *file_in* + ".txt".

		Prints a one-line summary with the elapsed time, both sizes and the
		compressed size as a percentage of the decompressed text. The text
		size is counted in characters, not bytes on disk.
		"""
		start = time.time()
		with open(file_in, "rb") as source:
			data = source.read()
		compressed_size = len(data)
		words, chunks = binary_in(data)
		data = decompress(words, chunks)
		size = len(data)
		if file_out is None:
			file_out = file_in + ".txt"
		with open(file_out, "w") as target:
			target.write(data)
		elapsed = time.time() - start
		# Percentage, to match the '%' in the message; 0 for empty output
		# instead of dividing by zero.
		percent = 100.0 * compressed_size / size if size else 0.0
		print("Decompressed file '%s' [%.2f s; %.2f kiB] - '%s' [%.2f kiB] (compression %.4f%%)"%(file_in, elapsed, compressed_size/1024.0, file_out, size/1024.0, percent))

	if __name__ == "__main__":
		# Command-line entry point: novel_zip {d|c} FILE_IN [FILE_OUT].
		# Without both a command and an input path, cmd stays empty and the
		# usage text is shown.
		cmd = ""
		if len(sys.argv) >= 3:
			cmd = sys.argv[1]
			file_in = sys.argv[2]
			file_out = sys.argv[3] if len(sys.argv) > 3 else None

		if cmd == "c":
			do_comprime(file_in, file_out)
		elif cmd == "d":
			do_decomprime(file_in, file_out)
		else:
			# Written with explicit newlines so the usage text does not depend
			# on how this block is indented.
			print(
				"Usage\n"
				"novel_zip {d|c} FILE_IN [FILE_OUT]\n"
				"\n"
				"  d    Decompress the file.\n"
				"  c    Compress the file.\n"
			)