# tpwrules/twc2txt.py

## twc2txt.py
# This Python 3 script extracts cards from TextWare cardfiles
# it dumps every card with its number and name to the directory called "output"
# under the current directory
# first and only argument is the .TWC file to extract
# license and format information are located at the bottom of the file
#
# PLEASE NOTE!
# This is designed to only extract text. I have not found a cardfile with pictures
# so I can't say how it performs. Also, many articles have typos and glitches.
# Check for them with the original viewer before concluding there is a bug.

import os, sys
import traceback
from struct import Struct, unpack
# the cardfile to extract is the first and only argument
if len(sys.argv) < 2:
    print("Usage: twc2txt.py cardfile.twc")
    sys.exit(1)
# fin stays open for the whole run, process() reads every card through it
fin = open(sys.argv[1], "rb")

si = Struct("<I")

# check the signature before touching the output directory
# a file too short to even hold it cannot be a cardfile either
fin.seek(0x04)
magic = fin.read(4)
if len(magic) < 4 or si.unpack(magic)[0]&0xffff != 0x0f0c:
    print("Not a .twc file")
    sys.exit(1)

# every card is written relative to "output", so create it if it is missing
# (the cardfile is already open, so a relative input path is not affected)
os.makedirs("output", exist_ok=True)
os.chdir("output")

fin.seek(0x160)
name, = unpack("60s", fin.read(60))
# the title is only printed, so escape stray non-ASCII bytes instead of aborting
print("Name: "+name.rstrip(b'\x00').decode("ascii", "backslashreplace"))

fin.seek(0x3B0)
num_cards, = si.unpack(fin.read(4))
print("Number of cards:", num_cards)

# now get the first card pointer
# it's at a fixed location in the file (I think), but part of some double nested table thing
# but fortunately cards are in a linked list so we don't have to parse it
fin.seek(0xC00)
card_ptr, = si.unpack(fin.read(4))
card_pos = 1

# card header, unpacked in process() as:
# num, prev, next_card_ptr, u1, t1, chunk_ptr, t2, length, name
card_hdr = Struct("<IIIIIIHH68s")
card_hdr_len = card_hdr.size
# extra header of indirect cards: data_ptr, offset, data_len
card_ind_hdr = Struct("<IHH")
card_ind_hdr_len = card_ind_hdr.size
# header of an additional chunk: cprev, chunk_ptr, t1, length
chunk_hdr = Struct("<IIIH")
chunk_hdr_len = chunk_hdr.size

# set up for making filenames
# generate translation to turn all non alnum into _
# (bytes.isalnum only accepts ASCII, so translated names are always pure ASCII)
name_translate = bytes(
    code if (bytes([code]).isalnum() or code == 0x20) else 0x5F
    for code in range(256)
)
# format to keep card files sorted
card_name_format = "{:0"+str(len(str(num_cards)))+"} - {}.txt"

def decompress_block(bytedata):
    """Decompress one TextWare compressed block.

    The block is a stream of big endian 12 bit words, packed two to every
    three bytes and ended by the word 0xFFF. A word below 256 is a literal
    byte. Any other word is a backreference to the word at index (word-256),
    counted from the start of the same block.

    bytedata: the compressed bytes; anything after the 0xFFF word is ignored
    Returns a bytearray with the decompressed data.
    Raises Exception("Decompression error.") when the block cannot be decoded
    (no 0xFFF word, backreference outside the block, runaway recursion, ...).
    """
    # first we have to convert to 12 bit words
    def unpack_words(packed):
        # three bytes $AA $AB $BB carry the two words AAA and BBB
        words = []
        for start in range(0, len(packed), 3):
            group = packed[start:start+3]
            # indexing a short final group raises IndexError, which is how
            # data that stops in the middle of a word is detected
            first = (group[0]<<4)+(group[1]>>4)
            words.append(first)
            if first == 0xFFF:
                return words
            second = ((group[1]&0x0F)<<8)+group[2]
            words.append(second)
            if second == 0xFFF:
                return words
        raise ValueError("compressed block has no 0xFFF word")

    def expand(pos=0, count=None):
        # count is None for the top level pass over the whole block,
        # otherwise it is the number of words this backreference may consume
        out = bytearray()
        in_words = 0
        while True:
            word = block[pos]
            if word == 0xFFF: break
            if word < 256:
                out.append(word)
            elif count is None:
                # a backreference met at the top level covers two words
                out.extend(expand(word-256, 2))
            else:
                # a nested backreference only gets the words its parent has left
                out.extend(expand(word-256, count-in_words))
            in_words += 1
            if in_words == count:
                break
            pos += 1
        return out

    try:
        block = unpack_words(bytedata)
        return expand()
    except Exception as exc:
        # only real decoding failures are converted; KeyboardInterrupt and
        # SystemExit are not Exceptions and pass through untouched
        raise Exception("Decompression error.") from exc

# most recently decompressed shared block and its file offset, see process()
last_data_ptr = None
last_data = None

def process(expected_num, card_ptr):
    """Extract the card whose header is at card_ptr into its own text file.

    expected_num: number the card is expected to carry, used to check that
        the linked list of cards is still being followed correctly
    card_ptr: file offset of the card header in fin

    The text is written verbatim to a file in the current directory, named
    from the card number and the sanitized card name.

    Returns the file offset of the next card header.
    Raises Exception when the card number does not match expected_num, when
    the card or one of its chunks has an unknown type, when the chunk list
    loops back on itself, or when decompression fails.
    """
    global last_data_ptr, last_data
    fin.seek(card_ptr)
    num, _prev, next_card_ptr, _u1, t1, chunk_ptr, t2, length, name = \
        card_hdr.unpack(fin.read(card_hdr_len))
    if expected_num != num:
        # the main loop recognizes this exact message and aborts the run on it
        raise Exception("Error following linked list. Something is very wrong. 0x{:08X}".format(card_ptr))
    print(num)
    card_type = (t1, t2)
    if card_type == (1, 7): # indirect compressed
        data_ptr, offset, data_len = \
            card_ind_hdr.unpack(fin.read(card_ind_hdr_len))
        # optimization to avoid decompressing data multiple times
        if last_data_ptr != data_ptr:
            fin.seek(data_ptr)
            last_data = decompress_block(fin.read(data_len))
            last_data_ptr = data_ptr
        # length counts the terminating 00, which is not part of the text
        text = last_data[offset:(offset+length-1)]
    elif card_type in ((5, 2), (1, 2)): #direct
        data = fin.read(length)
        if card_type == (5, 2): # compressed
            data = decompress_block(data)
        chunks = [data[:-1]] # remove terminating 00
        # load any additional chunks if necessary
        seen_chunks = set()
        while chunk_ptr != 0:
            # a damaged file could link a chunk back to an earlier one,
            # which would otherwise keep this loop running forever
            if chunk_ptr in seen_chunks:
                raise Exception("Chunk list loops at card #{}, 0x{:08x}".format(num, card_ptr))
            seen_chunks.add(chunk_ptr)
            fin.seek(chunk_ptr)
            _cprev, chunk_ptr, chunk_type, chunk_len = \
                chunk_hdr.unpack(fin.read(chunk_hdr_len))
            data = fin.read(chunk_len)
            if chunk_type == 4: # compressed chunk
                data = decompress_block(data)
            elif chunk_type != 0: # 0 is an uncompressed chunk
                raise Exception("Error at card #{}, 0x{:08x}".format(num, card_ptr))
            chunks.append(data[:-1])
        text = b''.join(chunks)
    else:
        raise Exception("Card of unknown type #{}, 0x{:08x}, TYPE: {}".format(num, card_ptr, card_type))

    # name_translate leaves only ASCII alnum, space and _, so this decode is safe
    tname = name.translate(name_translate).decode("ascii").strip("_")
    # with closes the file even if the write fails
    with open(card_name_format.format(num, tname), "wb") as fout:
        fout.write(text)
    return next_card_ptr

# (card number, card pointer, formatted traceback) of every card that failed
errors = []

# walk the linked list of cards, one process() call per card
while card_pos <= num_cards:
    try:
        card_ptr = process(card_pos, card_ptr)
    except KeyboardInterrupt:
        raise
    except Exception as e:
        # losing track of the linked list is fatal, everything else is per card
        # the message is only inspected when it really is a string
        message = e.args[0] if len(e.args) == 1 else None
        if isinstance(message, str) and message.startswith("Error following linked list. Something is very wrong. "):
            raise
        errors.append((card_pos, card_ptr, traceback.format_exc()))
        #get next pointer anyway
        #hopefully we aren't completely lost
        # the next card pointer is the third 32 bit field of the card header
        fin.seek(card_ptr+8)
        card_ptr, = si.unpack(fin.read(4))
    card_pos += 1

if not errors:
    print("All cards extracted successfully.")
    sys.exit(0)

print("There were errors extracting some cards.")
for failed_pos, failed_ptr, failed_trace in errors:
    print("CARD: {} ADDRESS: {:08X}".format(failed_pos, failed_ptr))
    print(failed_trace)

print("If the cards display correctly in the original viewer, tell me this stuff.")

# text is 00 terminated
# also note that text is stored with 0A line ending
# this decoder outputs that verbatim
# the viewer program converts this to dos 0D 0A

#5,2: direct compressed
# L: compressed length
#1,2: direct uncompressed
# L: uncompressed length

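#it is possible for direct cards to be in multiple chunks
#the card header holds the pointer to the first additional chunk (chunk_ptr in
#the code, called U3 in these notes); it is 0 when there are no more chunks
#do not confuse it with the U3 of the chunk header below, which is a type flag
#L only applies to the first chunk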
# chunk header
# P, N, U3, L
# 4, 4, 4, 2
# L: length of this chunk (compressed length if chunk is compressed)
# U3: 4 if compressed, 0 if uncompressed
# N: pointer to next chunk
# P: pointer to previous chunk (will not point to card header!)

#chunks are simply concatenated together after decompression (if applicable)
#(being mindful of terminating 00s)

#1,7: indirect compressed
# L: uncompressed length, A1: compressed block pointer A2: uncompressed offset
# A3: compressed block length

# this type allows multiple cards to be stored in one compressed block.
# the card is stored at position A2 in the uncompressed block
# and has uncompressed length L

#BIZARRE TYPES
#340,2
# seems to be a direct compressed type, but with no name. the data begins directly
# after the length word. the program treats its title as the first characters of
# the card. confusingly, attempting to go to this card by number gives an error
# that the card has been deleted

#compression is very simple backreferencing scheme
#stream is encoded as 12 bit words
# $AA $AB $BB
# A and B are two 12 bit words, stored big endian
# word with high nibble 0 is literal, low byte is output directly
# otherwise a backreference! begin processing from the beginning of the stream
# at offset (word-256)
#
# there is no length! instead we have 2 easy rules
# given offset = word-256, process the words at offset and offset+1
# 1. if the word at offset is yet another offset,
# process two words at that offset and offset+1, recursively
# 2. if the word at offset+1 is another offset, process ONE word at offset, recursively
# both these rules apply to any recursions
# this means the backreference length depends entirely on what is being backreferenced
#
# word of FFF seems to indicate end of compressed block

# Copyright (c) 2016 Thomas Watson (@tpw_rules)

# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is furnished to do
# so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
	# This Python 3 script extracts cards from TextWare cardfiles
	# it dumps every card with its number and name to the directory called "output"
	# under the current directory
	# first and only argument is the .TWC file to extract
	# license and format information are located at the bottom of the file
	#
	# PLEASE NOTE!
	# This is designed to only extract text. I have not found a cardfile with pictures
	# so I can't say how it performs. Also, many articles have typos and glitches.
	# Check for them with the original viewer before concluding there is a bug.

	import os, sys
	import traceback
	from struct import Struct, unpack
	# the cardfile to extract is the first and only argument
	if len(sys.argv) < 2:
	    print("Usage: twc2txt.py cardfile.twc")
	    sys.exit(1)
	# fin stays open for the whole run, process() reads every card through it
	fin = open(sys.argv[1], "rb")

	si = Struct("<I")

	# check the signature before touching the output directory
	# a file too short to even hold it cannot be a cardfile either
	fin.seek(0x04)
	magic = fin.read(4)
	if len(magic) < 4 or si.unpack(magic)[0]&0xffff != 0x0f0c:
	    print("Not a .twc file")
	    sys.exit(1)

	# every card is written relative to "output", so create it if it is missing
	# (the cardfile is already open, so a relative input path is not affected)
	os.makedirs("output", exist_ok=True)
	os.chdir("output")

	fin.seek(0x160)
	name, = unpack("60s", fin.read(60))
	# the title is only printed, so escape stray non-ASCII bytes instead of aborting
	print("Name: "+name.rstrip(b'\x00').decode("ascii", "backslashreplace"))

	fin.seek(0x3B0)
	num_cards, = si.unpack(fin.read(4))
	print("Number of cards:", num_cards)

	# now get the first card pointer
	# it's at a fixed location in the file (I think), but part of some double nested table thing
	# but fortunately cards are in a linked list so we don't have to parse it
	fin.seek(0xC00)
	card_ptr, = si.unpack(fin.read(4))
	card_pos = 1

	# card header, unpacked in process() as:
	# num, prev, next_card_ptr, u1, t1, chunk_ptr, t2, length, name
	card_hdr = Struct("<IIIIIIHH68s")
	card_hdr_len = card_hdr.size
	# extra header of indirect cards: data_ptr, offset, data_len
	card_ind_hdr = Struct("<IHH")
	card_ind_hdr_len = card_ind_hdr.size
	# header of an additional chunk: cprev, chunk_ptr, t1, length
	chunk_hdr = Struct("<IIIH")
	chunk_hdr_len = chunk_hdr.size

	# set up for making filenames
	# generate translation to turn all non alnum into _
	# (bytes.isalnum only accepts ASCII, so translated names are always pure ASCII)
	name_translate = bytes(
	    code if (bytes([code]).isalnum() or code == 0x20) else 0x5F
	    for code in range(256)
	)
	# format to keep card files sorted
	card_name_format = "{:0"+str(len(str(num_cards)))+"} - {}.txt"

	def decompress_block(bytedata):
	    """Decompress one TextWare compressed block.

	    The block is a stream of big endian 12 bit words, packed two to every
	    three bytes and ended by the word 0xFFF. A word below 256 is a literal
	    byte. Any other word is a backreference to the word at index (word-256),
	    counted from the start of the same block.

	    bytedata: the compressed bytes; anything after the 0xFFF word is ignored
	    Returns a bytearray with the decompressed data.
	    Raises Exception("Decompression error.") when the block cannot be decoded
	    (no 0xFFF word, backreference outside the block, runaway recursion, ...).
	    """
	    # first we have to convert to 12 bit words
	    def unpack_words(packed):
	        # three bytes $AA $AB $BB carry the two words AAA and BBB
	        words = []
	        for start in range(0, len(packed), 3):
	            group = packed[start:start+3]
	            # indexing a short final group raises IndexError, which is how
	            # data that stops in the middle of a word is detected
	            first = (group[0]<<4)+(group[1]>>4)
	            words.append(first)
	            if first == 0xFFF:
	                return words
	            second = ((group[1]&0x0F)<<8)+group[2]
	            words.append(second)
	            if second == 0xFFF:
	                return words
	        raise ValueError("compressed block has no 0xFFF word")

	    def expand(pos=0, count=None):
	        # count is None for the top level pass over the whole block,
	        # otherwise it is the number of words this backreference may consume
	        out = bytearray()
	        in_words = 0
	        while True:
	            word = block[pos]
	            if word == 0xFFF: break
	            if word < 256:
	                out.append(word)
	            elif count is None:
	                # a backreference met at the top level covers two words
	                out.extend(expand(word-256, 2))
	            else:
	                # a nested backreference only gets the words its parent has left
	                out.extend(expand(word-256, count-in_words))
	            in_words += 1
	            if in_words == count:
	                break
	            pos += 1
	        return out

	    try:
	        block = unpack_words(bytedata)
	        return expand()
	    except Exception as exc:
	        # only real decoding failures are converted; KeyboardInterrupt and
	        # SystemExit are not Exceptions and pass through untouched
	        raise Exception("Decompression error.") from exc

	# most recently decompressed shared block and its file offset, see process()
	last_data_ptr = None
	last_data = None

	def process(expected_num, card_ptr):
	    """Extract the card whose header is at card_ptr into its own text file.

	    expected_num: number the card is expected to carry, used to check that
	        the linked list of cards is still being followed correctly
	    card_ptr: file offset of the card header in fin

	    The text is written verbatim to a file in the current directory, named
	    from the card number and the sanitized card name.

	    Returns the file offset of the next card header.
	    Raises Exception when the card number does not match expected_num, when
	    the card or one of its chunks has an unknown type, when the chunk list
	    loops back on itself, or when decompression fails.
	    """
	    global last_data_ptr, last_data
	    fin.seek(card_ptr)
	    num, _prev, next_card_ptr, _u1, t1, chunk_ptr, t2, length, name = \
	        card_hdr.unpack(fin.read(card_hdr_len))
	    if expected_num != num:
	        # the main loop recognizes this exact message and aborts the run on it
	        raise Exception("Error following linked list. Something is very wrong. 0x{:08X}".format(card_ptr))
	    print(num)
	    card_type = (t1, t2)
	    if card_type == (1, 7): # indirect compressed
	        data_ptr, offset, data_len = \
	            card_ind_hdr.unpack(fin.read(card_ind_hdr_len))
	        # optimization to avoid decompressing data multiple times
	        if last_data_ptr != data_ptr:
	            fin.seek(data_ptr)
	            last_data = decompress_block(fin.read(data_len))
	            last_data_ptr = data_ptr
	        # length counts the terminating 00, which is not part of the text
	        text = last_data[offset:(offset+length-1)]
	    elif card_type in ((5, 2), (1, 2)): #direct
	        data = fin.read(length)
	        if card_type == (5, 2): # compressed
	            data = decompress_block(data)
	        chunks = [data[:-1]] # remove terminating 00
	        # load any additional chunks if necessary
	        seen_chunks = set()
	        while chunk_ptr != 0:
	            # a damaged file could link a chunk back to an earlier one,
	            # which would otherwise keep this loop running forever
	            if chunk_ptr in seen_chunks:
	                raise Exception("Chunk list loops at card #{}, 0x{:08x}".format(num, card_ptr))
	            seen_chunks.add(chunk_ptr)
	            fin.seek(chunk_ptr)
	            _cprev, chunk_ptr, chunk_type, chunk_len = \
	                chunk_hdr.unpack(fin.read(chunk_hdr_len))
	            data = fin.read(chunk_len)
	            if chunk_type == 4: # compressed chunk
	                data = decompress_block(data)
	            elif chunk_type != 0: # 0 is an uncompressed chunk
	                raise Exception("Error at card #{}, 0x{:08x}".format(num, card_ptr))
	            chunks.append(data[:-1])
	        text = b''.join(chunks)
	    else:
	        raise Exception("Card of unknown type #{}, 0x{:08x}, TYPE: {}".format(num, card_ptr, card_type))

	    # name_translate leaves only ASCII alnum, space and _, so this decode is safe
	    tname = name.translate(name_translate).decode("ascii").strip("_")
	    # with closes the file even if the write fails
	    with open(card_name_format.format(num, tname), "wb") as fout:
	        fout.write(text)
	    return next_card_ptr

	# (card number, card pointer, formatted traceback) of every card that failed
	errors = []

	# walk the linked list of cards, one process() call per card
	while card_pos <= num_cards:
	    try:
	        card_ptr = process(card_pos, card_ptr)
	    except KeyboardInterrupt:
	        raise
	    except Exception as e:
	        # losing track of the linked list is fatal, everything else is per card
	        # the message is only inspected when it really is a string
	        message = e.args[0] if len(e.args) == 1 else None
	        if isinstance(message, str) and message.startswith("Error following linked list. Something is very wrong. "):
	            raise
	        errors.append((card_pos, card_ptr, traceback.format_exc()))
	        #get next pointer anyway
	        #hopefully we aren't completely lost
	        # the next card pointer is the third 32 bit field of the card header
	        fin.seek(card_ptr+8)
	        card_ptr, = si.unpack(fin.read(4))
	    card_pos += 1

	if not errors:
	    print("All cards extracted successfully.")
	    sys.exit(0)

	print("There were errors extracting some cards.")
	for failed_pos, failed_ptr, failed_trace in errors:
	    print("CARD: {} ADDRESS: {:08X}".format(failed_pos, failed_ptr))
	    print(failed_trace)

	print("If the cards display correctly in the original viewer, tell me this stuff.")

	# text is 00 terminated
	# also note that text is stored with 0A line ending
	# this decoder outputs that verbatim
	# the viewer program converts this to dos 0D 0A

	#5,2: direct compressed
	# L: compressed length
	#1,2: direct uncompressed
	# L: uncompressed length

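	#it is possible for direct cards to be in multiple chunks
	#the card header holds the pointer to the first additional chunk (chunk_ptr in
	#the code, called U3 in these notes); it is 0 when there are no more chunks
	#do not confuse it with the U3 of the chunk header below, which is a type flag
	#L only applies to the first chunk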
	# chunk header
	# P, N, U3, L
	# 4, 4, 4, 2
	# L: length of this chunk (compressed length if chunk is compressed)
	# U3: 4 if compressed, 0 if uncompressed
	# N: pointer to next chunk
	# P: pointer to previous chunk (will not point to card header!)

	#chunks are simply concatenated together after decompression (if applicable)
	#(being mindful of terminating 00s)

	#1,7: indirect compressed
	# L: uncompressed length, A1: compressed block pointer A2: uncompressed offset
	# A3: compressed block length

	# this type allows multiple cards to be stored in one compressed block.
	# the card is stored at position A2 in the uncompressed block
	# and has uncompressed length L

	#BIZARRE TYPES
	#340,2
	# seems to be a direct compressed type, but with no name. the data begins directly
	# after the length word. the program treats its title as the first characters of
	# the card. confusingly, attempting to go to this card by number gives an error
	# that the card has been deleted

	#compression is very simple backreferencing scheme
	#stream is encoded as 12 bit words
	# $AA $AB $BB
	# A and B are two 12 bit words, stored big endian
	# word with high nibble 0 is literal, low byte is output directly
	# otherwise a backreference! begin processing from the beginning of the stream
	# at offset (word-256)
	#
	# there is no length! instead we have 2 easy rules
	# given offset = word-256, process the words at offset and offset+1
	# 1. if the word at offset is yet another offset,
	# process two words at that offset and offset+1, recursively
	# 2. if the word at offset+1 is another offset, process ONE word at offset, recursively
	# both these rules apply to any recursions
	# this means the backreference length depends entirely on what is being backreferenced
	#
	# word of FFF seems to indicate end of compressed block

	# Copyright (c) 2016 Thomas Watson (@tpw_rules)

	# Permission is hereby granted, free of charge, to any person obtaining a copy of
	# this software and associated documentation files (the "Software"), to deal in
	# the Software without restriction, including without limitation the rights to
	# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
	# of the Software, and to permit persons to whom the Software is furnished to do
	# so, subject to the following conditions:

	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.

	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
	# IN THE SOFTWARE.