Last active
August 29, 2015 14:06
-
-
Save coyotebush/4302d211286ebbb75a8c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import struct | |
import sys | |
import binascii | |
# def str2bytes(string): | |
# return "".join("\\x" + "{0:02x}".format(ord(x)) for x in string) | |
def compare(bs, ks, tag=None):
    """Abort the program unless the decoded bytes equal the reference bytes.

    Args:
        bs: byte string produced by the decoder.
        ks: byte string read from the known plaintext for the same span.
        tag: optional integer tag of the back-reference being checked; when
            given it is included (as hex) in the mismatch report.

    Returns:
        None when ``bs == ks``.

    Raises:
        SystemExit: with status 1 when the two byte strings differ, after a
            byte-by-byte report has been written to stderr.
    """
    if bs == ks:
        return

    def _hex(piece):
        # An empty slice means that side ran out of bytes; show a placeholder.
        return binascii.hexlify(piece).decode("ascii") or "--"

    # Diagnostics go to stderr so they never mix with decoded data on stdout.
    report = sys.stderr.write
    report("ERROR mismatched\n")
    if tag is not None:
        report("{0:02x}\n".format(tag))
    # The reference read can come back short (end of file), so walk the longer
    # of the two inputs. Slicing instead of indexing never raises past the end
    # and yields a 1-byte string on both Python 2 and Python 3.
    for j in range(max(len(bs), len(ks))):
        got = bs[j:j + 1]
        want = ks[j:j + 1]
        report("{0} {1} {2!r} {3!r}\n".format(_hex(got), _hex(want), got, want))
    sys.exit(1)
# Token format of the compressed stream:
#   0x13 (^S) - the next 9 bytes are literal data
#   even opcodes 0x08 through 0x12 - take opcode/2 bytes from the block
#     identified by the next 2 bytes (the tag)
def decode(data, known):
    """Decode one compressed stream while checking it against known plaintext.

    The stream is a sequence of tokens:

    * ``0x13`` followed by 9 literal bytes, which are emitted and remembered
      as a new chunk;
    * an even opcode ``0x08``..``0x12`` followed by a little-endian 16-bit
      tag, meaning "emit the first ``opcode // 2`` bytes of the chunk this
      tag refers to".

    The tag-to-chunk relation is learned rather than computed. The first time
    a tag is seen, the bytes are taken from ``known`` and the tag is bound to
    the first remembered chunk that starts with those bytes. Later uses of the
    tag are expanded from that chunk and verified with ``compare``.

    Args:
        data: the compressed byte string.
        known: binary file-like object holding the expected plaintext; it is
            read sequentially with ``read(n)``.

    Side effects:
        Writes the decoded bytes to stdout. Recreates ``map.csv`` in the
        current directory and appends a ``tag, chunk index, "hex"`` row for
        every mapping learned from a full 9-byte read.

    Raises:
        SystemExit: with status 1 on an unknown opcode or a truncated tag
            (message on stderr), or from ``compare`` on a mismatch.
    """
    literal_op = 0x13
    literal_len = 9
    copy_ops = (0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12)

    # Decoded output is raw bytes. On Python 3 that has to go to the binary
    # layer under sys.stdout; flush pending text first so ordering is kept.
    sys.stdout.flush()
    out = getattr(sys.stdout, "buffer", sys.stdout)

    pos = 0
    chunks = []
    tagmap = {}
    # The context manager closes map.csv even when we leave via sys.exit().
    with open('map.csv', 'w') as mapfile:
        while pos < len(data):
            # Slice rather than index: gives a 1-byte string on Python 2 and 3.
            op = ord(data[pos:pos + 1])
            pos += 1
            if op == literal_op:
                bs = data[pos:pos + literal_len]
                pos += literal_len
                compare(bs, known.read(literal_len))
                chunks.append(bs)
                out.write(bs)
            elif op in copy_ops:
                if pos + 2 > len(data):
                    sys.stderr.write(
                        "truncated tag after opcode 0x{0:x}\n".format(op))
                    sys.exit(1)
                (tag,) = struct.unpack_from("<H", data, pos)
                pos += 2
                length = op // 2
                if tag in tagmap:
                    bs = chunks[tagmap[tag]][:length]
                    compare(bs, known.read(length), tag)
                    out.write(bs)
                else:
                    m = known.read(length)
                    out.write(m)
                    if len(m) != length:
                        # Short read (end of reference): a partial or empty
                        # prefix would match an arbitrary chunk, so learn
                        # nothing from it.
                        continue
                    for j, chunk in enumerate(chunks):
                        if chunk.startswith(m):
                            tagmap[tag] = j
                            if length == literal_len:
                                mapfile.write(
                                    "0x{0:x}, {1:d}, \"{2}\"\n".format(
                                        tag, j,
                                        binascii.hexlify(m).decode("ascii")))
                                mapfile.flush()
                            break
            else:
                sys.stderr.write("oops 0x{0:x}\n".format(op))
                sys.exit(1)
# Usage: weissman.py weissman.csawlz pg28885.txt
def main(argv=None):
    """List the entries of an archive and decode the one named like argv[2].

    The archive starts with a 16-byte header (8-byte magic, version, number
    of files; little-endian). Each entry is a 44-byte header (magic,
    compressed size, uncompressed size, NUL-padded 32-byte file name)
    followed by ``compressed size`` bytes of data. Every header is printed to
    stdout as a tuple. The entry whose name equals ``argv[2]`` is passed to
    ``decode`` together with the file at that same path as known plaintext.

    Args:
        argv: argument list shaped like ``sys.argv`` (program name, archive
            path, known-plaintext path). Defaults to ``sys.argv``.

    Returns:
        0 on success, 1 if the archive is truncated, 2 on a usage error.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        prog = argv[0] if argv else "weissman.py"
        sys.stderr.write("usage: {0} ARCHIVE KNOWN_FILE\n".format(prog))
        return 2
    archive_path, known_path = argv[1], argv[2]

    # Entry names come out of the archive as bytes; on Python 3 argv is text,
    # so encode it once to make the comparison below meaningful.
    wanted = known_path
    if not isinstance(wanted, bytes):
        wanted = wanted.encode("utf-8")

    archive_header = struct.Struct("<8sii")
    entry_header = struct.Struct("<iii32s")

    # Both files are binary: text mode would corrupt or reject the data.
    with open(archive_path, "rb") as infile:
        raw = infile.read(archive_header.size)
        if len(raw) != archive_header.size:
            sys.stderr.write("truncated archive header\n")
            return 1
        (magic, version, num_files) = archive_header.unpack(raw)
        sys.stdout.write("{0!r}\n".format((magic, version, num_files)))
        for _ in range(num_files):
            raw = infile.read(entry_header.size)
            if len(raw) != entry_header.size:
                sys.stderr.write("truncated entry header\n")
                return 1
            (magic, c_size, u_size, filename) = entry_header.unpack(raw)
            filename = filename.rstrip(b'\0')
            sys.stdout.write(
                "{0!r}\n".format((magic, c_size, u_size, filename)))
            data = infile.read(c_size)
            if filename == wanted:
                with open(known_path, "rb") as known:
                    decode(data, known)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment