Skip to content

Instantly share code, notes, and snippets.

@fbwright
Created March 28, 2015 17:43
Show Gist options
Save fbwright/7674756039f0a4ad5484 to your computer and use it in GitHub Desktop.
Novel compressor
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, division
import sys, time
# Python 2 compatibility shim: expose ``raw_input`` under the Python 3 name.
# Index access is used because it works on every interpreter version.
if sys.version_info[0] < 3:
    input = raw_input  # noqa: F821 -- ``raw_input`` only exists on Python 2
def _split_token(token):
    """Split *token* into (leading punctuation, core word, trailing punctuation)."""
    start, end = 0, len(token)
    while start < end and token[start] in PRE_PUNCTUATION:
        start += 1
    while end > start and token[end - 1] in POST_PUNCTUATION:
        end -= 1
    return token[:start], token[start:end], token[end:]


def _encode_index(index):
    """Return the 1-, 2- or 3-byte chunk that encodes dictionary position *index*.

    The two bits above the low five bits of the first byte hold a size tag
    (0, 1 or 2) that tells ``parse_chunks`` how many bytes follow.  Each
    range is biased by the capacity of the shorter encodings.
    """
    if index < 0x20:
        return bytes([index])
    if index < 0x2020:
        index -= 0x20
        return bytes([(1 << 5) | (index >> 8), index & 0xFF])
    index -= 0x2020
    if index >= 1 << 21:
        raise ValueError(
            "Dictionary index %d does not fit in three bytes." % (index + 0x2020))
    # Tag 2 marks a three-byte chunk; tag 1 here would make the decoder
    # read only two bytes and desynchronise the stream.
    return bytes([(2 << 5) | (index >> 16), (index >> 8) & 0xFF, index & 0xFF])


def compress(data):
    """Encode the text *data* as a word dictionary plus a list of byte chunks.

    The text is split into lines and whitespace-separated tokens.  Leading
    and trailing punctuation is stripped from each token and the remainder
    is split on ``-``.  Words are stored lower-cased; a capitalised or
    all-upper-case word is preceded by a case marker, any other casing is
    not recorded.

    Chunk values:
        0x60 + i   leading punctuation ``PRE_PUNCTUATION[i]``
        0x70 + i   trailing punctuation ``POST_PUNCTUATION[i]``
        0x80       end of text
        0x81       end of line
        0x82/0x83  next word is capitalised / upper-case
        0x84       a hyphen joins the previous and the next word
        otherwise  a dictionary index (see ``_encode_index``)

    Returns:
        ``(words, chunks)``: ``words`` is a tuple of lower-cased words ordered
        by decreasing ``len(word) * count``; ``chunks`` is a list of ``bytes``.

    Raises:
        ValueError: if there are more distinct words than three bytes can index.
    """
    # First pass: count every case-folded word.
    freq = {}
    for line in data.splitlines():
        for token in line.split():
            core = _split_token(token)[1]
            for part in core.split('-'):
                if part:
                    key = part.lower()
                    freq[key] = freq.get(key, 0) + 1

    # Words that cover the most text get the lowest (shortest) indexes.
    ranked = sorted(freq.items(), key=lambda item: -len(item[0]) * item[1])
    words = tuple(word for word, _ in ranked)
    index_of = {word: position for position, word in enumerate(words)}

    # Second pass: emit the chunk stream.
    chunks = []
    for line in data.splitlines():
        for token in line.split():
            leading, core, trailing = _split_token(token)
            for char in leading:
                chunks.append(bytes([0x60 + PRE_PUNCTUATION.index(char)]))
            for position, part in enumerate(core.split('-')):
                if position:
                    # Only *between* parts: a marker after the last part
                    # would glue the following token on with a hyphen.
                    chunks.append(b"\x84")
                if not part:
                    # Nothing to encode; in particular no case marker,
                    # which would otherwise apply to the next real word.
                    continue
                if part == part.capitalize():
                    chunks.append(b"\x82")
                elif part == part.upper():
                    chunks.append(b"\x83")
                chunks.append(_encode_index(index_of[part.lower()]))
            for char in trailing:
                chunks.append(bytes([0x70 + POST_PUNCTUATION.index(char)]))
        chunks.append(b"\x81")
    chunks.append(b"\x80")
    return words, chunks
def _section(tag, payload):
    """Return *payload* framed as ``tag`` + 4-byte big-endian length + data."""
    # ``to_bytes`` raises OverflowError instead of silently truncating a
    # length that does not fit in 32 bits.
    return tag + len(payload).to_bytes(4, "big") + payload


def binary_out(dict, chunks):
    """Serialise a dictionary and chunk list into the ZTXT-B container.

    Layout: the 8-byte magic ``b"ZTXT-B\\r\\n"``, a ``DICT`` section holding
    every word UTF-8 encoded and NUL-terminated, then a ``TEXT`` section
    holding the concatenated chunks.

    Args:
        dict: iterable of words (``str``).
        chunks: iterable of ``bytes`` objects.

    Returns:
        The complete file contents as ``bytes``.

    Raises:
        OverflowError: if a section is larger than a 32-bit length can express.
    """
    # Terminate each word individually so that an empty word list yields an
    # empty section rather than a single NUL (which reads back as one
    # empty word).
    words = b"".join(word.encode("utf-8") + b"\x00" for word in dict)
    text = b"".join(chunks)
    return b"ZTXT-B\r\n" + _section(b"DICT", words) + _section(b"TEXT", text)
def parse_chunks(data):
    """Split the raw ``TEXT`` payload *data* into a list of integer chunks.

    Bytes ``>= 0x80`` are single-byte chunks.  For any other byte, the value
    of ``byte >> 5`` is a size tag: tag 1 means one more byte follows, tag 2
    means two more follow, and tags 0 and 3 stand alone.  A multi-byte chunk
    is returned as one big-endian integer that still includes the tag bits.

    Args:
        data: a bytes-like object.

    Returns:
        A list of ``int``.

    Raises:
        IndexError: if *data* ends in the middle of a multi-byte chunk.
    """
    chunks = []
    index = 0
    length = len(data)
    while index < length:
        byte = data[index]
        tag = byte >> 5
        # Only tags 1 and 2 have continuation bytes; bytes >= 0x80 have a
        # tag of 4 or more and therefore never do.
        extra = tag if tag in (1, 2) else 0
        end = index + 1 + extra
        if end > length:
            raise IndexError("Truncated chunk at offset %d." % index)
        chunks.append(int.from_bytes(data[index:end], "big"))
        index = end
    return chunks
def binary_in(data):
    """Parse the contents of a ZTXT-B file into ``(dict, chunks)``.

    After the 8-byte magic the file is a sequence of sections, each made of
    a 4-byte tag, a 4-byte big-endian length and that many payload bytes.
    ``DICT`` holds NUL-terminated UTF-8 words and ``TEXT`` holds the chunk
    stream; sections with any other tag are skipped.  If a tag occurs more
    than once, the last occurrence wins.

    Args:
        data: the file contents as ``bytes``.

    Returns:
        ``(dict, chunks)``: a list of words and the list of integer chunks
        produced by ``parse_chunks``.  Either is empty if its section is
        missing.

    Raises:
        ValueError: if the magic is wrong or a section is truncated.
    """
    magic = b"ZTXT-B\r\n"
    if data[:len(magic)] != magic:
        raise ValueError("Not a ZTXT-B file: bad header %r." % (data[:len(magic)],))
    data = data[len(magic):]

    words, chunks = [], []
    index = 0
    while index < len(data):
        if len(data) - index < 8:
            raise ValueError(
                "Truncated section header at offset %d." % (index + len(magic)))
        tag = data[index:index + 4]
        size = int.from_bytes(data[index + 4:index + 8], "big")
        payload = data[index + 8:index + 8 + size]
        if len(payload) != size:
            raise ValueError(
                "Section %r at offset %d is truncated: expected %d bytes, found %d."
                % (tag, index + len(magic), size, len(payload)))
        index += 8 + size
        if tag == b"DICT":
            # Every word is NUL-terminated, so the piece after the final
            # NUL is always empty and is dropped.
            words = [entry.decode("utf-8") for entry in payload.split(b"\x00")[:-1]]
        elif tag == b"TEXT":
            chunks = parse_chunks(payload)
    return words, chunks
# Case to apply to the next decoded word.
LOWER, CAPITALIZE, UPPER = range(3)

# Punctuation stripped from the start / end of a token.  A character is
# encoded by its position in these strings, so their order is part of the
# file format.
PRE_PUNCTUATION = "([{'\"<"
POST_PUNCTUATION = ",.?!:;)]}'\">"
def decompress(dict, chunks):
    """Rebuild text from the word list *dict* and the integer *chunks*.

    Chunks in ``0x80..0xFF`` are control codes: 0x80 ends the text, 0x81
    ends a line, 0x82 / 0x83 capitalise / upper-case the next word and 0x84
    makes ``-`` the next delimiter.  Any other chunk carries a two-bit size
    tag above its payload: tags 0-2 are dictionary indexes of increasing
    width, tag 3 is a punctuation mark (payload ``>= 16`` selects
    ``POST_PUNCTUATION``, otherwise ``PRE_PUNCTUATION``).

    Args:
        dict: sequence of lower-case words.
        chunks: iterable of ``int`` as produced by ``parse_chunks``.

    Returns:
        The decoded text as ``str``.
    """
    pieces = []            # output fragments, joined once at the end
    case = LOWER           # case to apply to the next word
    delimiter = ""         # text emitted in front of the next word
    next_delimiter = " "   # delimiter in effect after the current chunk
    for chunk in chunks:
        if 0x80 <= chunk < 0x100:
            code = chunk - 0x80
            if code == 0:  # end of text
                break
            if code == 1:  # end of line: the next word starts flush left
                pieces.append("\n")
                next_delimiter = ""
            elif code == 2:  # capitalise; keep the pending delimiter
                case = CAPITALIZE
                next_delimiter = delimiter
            elif code == 3:  # upper-case; keep the pending delimiter
                case = UPPER
                next_delimiter = delimiter
            elif code == 4:  # hyphen before the next word
                next_delimiter = "-"
        else:
            # The tag sits just above the 5, 13 or 21 payload bits,
            # depending on how many bytes the chunk was read from.
            if chunk >= 0x10000:
                shift = 21
            elif chunk >= 0x100:
                shift = 13
            else:
                shift = 5
            size = (chunk >> shift) & 0x3
            if size == 3:
                position = chunk & 0x1f
                if position >= 16:
                    pieces.append(POST_PUNCTUATION[position - 16])
                else:
                    # Leading punctuation takes the place of the delimiter
                    # in front of whatever comes next.
                    pieces.append(delimiter)
                    next_delimiter = PRE_PUNCTUATION[position]
            else:
                if size == 0:
                    index = chunk & 0x1f
                elif size == 1:
                    index = (chunk & 0x1fff) + 0x20
                else:
                    index = (chunk & 0x1fffff) + 0x2020
                word = dict[index]
                if case == CAPITALIZE:
                    word = word.capitalize()
                elif case == UPPER:
                    word = word.upper()
                pieces.append(delimiter)
                pieces.append(word)
                case = LOWER
        delimiter = next_delimiter
        next_delimiter = " "
    return "".join(pieces)
def do_comprime(file_in, file_out=None):
    """Compress the text file *file_in* and print a one-line summary.

    Args:
        file_in: path of the text file to read.
        file_out: path of the compressed file to write; defaults to
            *file_in* with ``.ztxt`` appended.
    """
    start = time.time()
    with open(file_in, "r") as source:
        data = source.read()
    size = len(data)  # length in characters, since the file is read as text
    words, chunks = compress(data)
    data = binary_out(words, chunks)
    compressed_size = len(data)
    if file_out is None:
        file_out = file_in + ".ztxt"
    with open(file_out, "wb") as target:
        target.write(data)
    elapsed = time.time() - start
    # The format string labels the value with '%', so scale the ratio to a
    # percentage; an empty input has no meaningful ratio, so report 0.
    ratio = 100.0 * compressed_size / size if size else 0.0
    print("Compressed file '%s' [%.2f s; %.2f kiB] - '%s' [%.2f kiB] (compression %.4f%%)"%(file_in, elapsed, size / 1024.0, file_out, compressed_size / 1024.0, ratio))
def do_decomprime(file_in, file_out=None):
    """Decompress the ZTXT-B file *file_in* and print a one-line summary.

    Args:
        file_in: path of the compressed file to read.
        file_out: path of the text file to write; defaults to *file_in*
            with ``.txt`` appended.
    """
    start = time.time()
    with open(file_in, "rb") as source:
        data = source.read()
    compressed_size = len(data)
    words, chunks = binary_in(data)
    data = decompress(words, chunks)
    size = len(data)  # length in characters of the decoded text
    if file_out is None:
        file_out = file_in + ".txt"
    with open(file_out, "w") as target:
        target.write(data)
    elapsed = time.time() - start
    # The format string labels the value with '%', so scale the ratio to a
    # percentage; an empty output has no meaningful ratio, so report 0.
    ratio = 100.0 * compressed_size / size if size else 0.0
    print("Decompressed file '%s' [%.2f s; %.2f kiB] - '%s' [%.2f kiB] (compression %.4f%%)"%(file_in, elapsed, compressed_size / 1024.0, file_out, size / 1024.0, ratio))
if __name__ == "__main__":
    # Command line: novel_zip {d|c} FILE_IN [FILE_OUT]
    # Fewer than two arguments, or an unknown command, prints the usage text.
    arguments = sys.argv[1:]
    handlers = {"c": do_comprime, "d": do_decomprime}
    handler = handlers.get(arguments[0]) if len(arguments) >= 2 else None
    if handler is not None:
        file_in = arguments[1]
        file_out = arguments[2] if len(arguments) > 2 else None
        handler(file_in, file_out)
    else:
        print("""Usage
novel_zip {d|c} FILE_IN [FILE_OUT]
d Decompress the file.
c Compress the file.
""")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment