mxwell/dst_format_ctl.py

## dst_format_ctl.py
#! /usr/bin/env python3

"""
An AutoCAD SheetSet file with the *.dst extension is just an XML stream encoded with a substitution cipher.
The XML data is first encoded as UTF-8, so characters may have different lengths (from one to four bytes).
It looks like the substitution needs to be done on a per-character level rather than on a per-byte level.

== Restore conversion dictionary ==

Given two files "Crack.xml" and "Crack.dst", we can restore the conversion dictionary (limited to the character set used in these files) as follows:

python dst_format_ctl.py restore_dictionary --xml Crack.xml --dst Crack.dst --dictionary dictionary.json

The dictionary is stored in "dictionary.json". It includes mappings for both directions - XML to DST and vice versa.

== Convert *.dst to *.xml ==

Given "sheet.dst" and "dictionary.json", we can produce "sheet.xml":

python dst_format_ctl.py convert --direction dst_to_xml --dst sheet.dst --xml sheet.xml --dictionary dictionary.json

== Convert *.xml to *.dst ==

Given "sheet.xml" and "dictionary.json", we can produce "sheet.dst":

python dst_format_ctl.py convert --direction xml_to_dst --dst sheet.dst --xml sheet.xml --dictionary dictionary.json
"""

import argparse
import json
import logging
import sys


def load_sequence(filename):
    """Read a file and return its content as a list of byte values.

    Args:
        filename: Path of the file to read; it is opened in binary mode.

    Returns:
        A list of integers in the range 0..255, one per byte of the file,
        in file order. An empty file yields an empty list.
    """
    # One buffered read instead of a read(1) call per byte; iterating a
    # bytes object yields the same integers that ord() produced before.
    with open(filename, "rb") as f:
        result = list(f.read())
    logging.info("Loaded sequence of %d byte(s) from %s", len(result), filename)
    return result


def save_sequence(sequence, filename):
    """Write a sequence of byte values to a file.

    Args:
        sequence: Iterable of integers in the range 0..255.
        filename: Path of the output file; it is created or truncated.

    Raises:
        ValueError: If an element is outside the range 0..255.
    """
    # Build the payload first so invalid input fails before the target file
    # is truncated, then emit it with a single write instead of one per byte.
    payload = bytes(sequence)
    with open(filename, "wb") as output:
        output.write(payload)
    logging.info("Sequence of %d byte(s) is saved to %s", len(payload), filename)


def join_bytes(sequence, start, end):
    """Pack the items at indices ``start`` .. ``end - 1`` into one integer.

    The items are combined big-endian: the item at ``start`` ends up in the
    most significant position and each following item is shifted in below it.

    Args:
        sequence: Indexable sequence of byte values.
        start: Index of the first item to pack.
        end: Index one past the last item to pack.

    Returns:
        The packed integer; 0 when the range is empty.

    Raises:
        Exception: If ``end`` lies beyond the end of ``sequence``.
    """
    if end > len(sequence):
        raise Exception("sequence is too short to extract {} bytes of unicode character at {}".format(end - start, start))
    packed = 0
    for position in range(start, end):
        packed <<= 8
        packed |= sequence[position]
    return packed


def split_bytes(encoding, span):
    """Unpack an integer into ``span`` byte values, most significant first.

    This is the inverse of packing bytes big-endian into one integer. Only
    the lowest ``span`` bytes of ``encoding`` are produced; higher bits are
    dropped.

    Args:
        encoding: Integer to unpack.
        span: Number of byte values to produce.

    Returns:
        A reverse iterator (not a list) over the byte values.
    """
    # Collect least-significant byte first, then hand back a reversed view.
    low_to_high = [(encoding >> (8 * offset)) & 0xFF for offset in range(span)]
    return reversed(low_to_high)


def _utf8_span(lead_byte):
    """Return how many bytes the character starting with ``lead_byte`` spans.

    The decision is made from the high bits of the byte only: 0xF0 and above
    give 4, 0xE0..0xEF give 3, 0xC0..0xDF give 2, anything else gives 1.
    """
    if (lead_byte >> 4) == 0xF:
        return 4
    if (lead_byte >> 5) == 0x7:
        return 3
    if (lead_byte >> 6) == 0x3:
        return 2
    return 1


def restore_dictionary_command(args):
    """Rebuild the substitution dictionary from a matching XML/DST file pair.

    Both files are walked in lockstep, one character at a time. The character
    length is taken from the lead byte of the XML side and the same number of
    bytes is taken from the DST side. Each pair is recorded in both directions
    and the result is written as JSON with the keys "dst_to_xml" and
    "xml_to_dst".

    Args:
        args: Parsed command-line arguments providing ``xml``, ``dst`` and
            ``dictionary`` paths.

    Raises:
        AssertionError: If the two files differ in length.
        Exception: If one character maps to two different counterparts, or
            if a character runs past the end of the data.
    """
    xml = load_sequence(args.xml)
    dst = load_sequence(args.dst)
    if len(xml) != len(dst):
        # Raised explicitly rather than via `assert` so the check still runs
        # under `python -O`; the exception type is the one `assert` produced.
        raise AssertionError("both *.xml and *.dst files must be of the same length")

    cipher = {}
    decipher = {}
    n = len(xml)
    i = 0
    while i < n:
        b = xml[i]
        span = _utf8_span(b)
        logging.debug("i %d, n %d, b %d, span %d", i, n, b, span)
        cur = join_bytes(xml, i, i + span)
        encoding = join_bytes(dst, i, i + span)

        # setdefault() stores the pair on first sight and otherwise returns
        # the earlier mapping, which must agree with the current one.
        known_encoding = cipher.setdefault(cur, encoding)
        if known_encoding != encoding:
            raise Exception("inconsistent encoding: {} was encoded as {} firstly and as {} afterwards".format(cur, known_encoding, encoding))
        known_decoding = decipher.setdefault(encoding, cur)
        if known_decoding != cur:
            raise Exception("inconsistent decoding: {} was decoded as {} firstly and as {} afterwards".format(encoding, known_decoding, cur))
        i += span

    logging.info("Detected encoding of %d characters", len(cipher))
    logging.info("Detected decoding of %d characters", len(decipher))

    logging.info("Saving dictionary data to %s", args.dictionary)
    # JSON object keys are always strings, so the integer keys are stored as
    # their decimal text.
    with open(args.dictionary, "w") as a_file:
        json.dump({
            "dst_to_xml": decipher,
            "xml_to_dst": cipher,
        }, a_file)


def load_dictionary(path, direction):
    """Load the mapping for one conversion direction from a JSON file.

    Args:
        path: Path of the JSON dictionary file.
        direction: Top-level key of the mapping to return.

    Returns:
        The object stored under ``direction`` in the JSON document.

    Raises:
        KeyError: If the document has no entry for ``direction``.
    """
    with open(path) as source:
        mappings = json.load(source)
    table = mappings[direction]
    logging.info("Loaded dictionary of %d character(s) from %s for direction %s", len(table), path, direction)
    return table


def convert_command(args):
    """Convert a file between its DST and XML form using a saved dictionary.

    At every position the input is probed with character lengths of one to
    four bytes, shortest first; the first length whose packed value is found
    in the dictionary is used and replaced by the mapped value of the same
    length. If no length matches, an error is logged, conversion stops, and
    whatever was converted so far is still written to the output file.

    Args:
        args: Parsed command-line arguments providing ``direction``
            ("dst_to_xml" or "xml_to_dst"), ``dst``, ``xml`` and
            ``dictionary`` paths.
    """
    if args.direction == "dst_to_xml":
        input_path, output_path = args.dst, args.xml
    else:
        input_path, output_path = args.xml, args.dst
    sequence = load_sequence(input_path)
    dictionary = load_dictionary(args.dictionary, args.direction)

    n = len(sequence)
    i = 0

    result = []
    while i < n:
        # Never probe past the end of the input: join_bytes() raises there,
        # which used to abort the command with a traceback instead of taking
        # the "failed to decode" path below and saving the partial result.
        longest = min(4, n - i)
        for span in range(1, longest + 1):
            # Dictionary keys come from JSON and are therefore strings.
            cur = str(join_bytes(sequence, i, i + span))
            logging.debug("trying %s with span %d", cur, span)
            if cur in dictionary:
                result.extend(split_bytes(dictionary[cur], span))
                i += span
                break
        else:
            # No candidate length matched at this position.
            logging.error("failed to decode character sequence at %d, stopping here", i)
            break
    save_sequence(result, output_path)


def main():
    """Parse the command line and run the selected sub-command.

    Two sub-commands are available: "restore_dictionary" and "convert".
    Logging is configured at INFO level before anything else runs.

    Returns:
        0 after the selected sub-command has returned.
    """
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    # A sub-command is mandatory: without one there is no `func` attribute on
    # the parsed arguments and the dispatch below would fail with an
    # AttributeError. `dest` gives argparse a name to use in its error message.
    subparsers = parser.add_subparsers(help="Modes of operation", dest="command")
    subparsers.required = True

    restore_dictionary = subparsers.add_parser("restore_dictionary")
    restore_dictionary.set_defaults(func=restore_dictionary_command)
    restore_dictionary.add_argument("--xml", help="Path to original *.xml file", required=True)
    restore_dictionary.add_argument("--dst", help="Path to encoded *.dst file", required=True)
    restore_dictionary.add_argument("--dictionary", help="Path to save dictionary data", default="dictionary.json")

    convert = subparsers.add_parser("convert")
    convert.set_defaults(func=convert_command)
    # The direction selects both the input file and the dictionary section;
    # leaving it out used to end in a KeyError during the dictionary lookup.
    convert.add_argument("--direction", help="Direction of conversion", choices=["xml_to_dst", "dst_to_xml"], required=True)
    convert.add_argument("--xml", help="Path to original *.xml file", required=True)
    convert.add_argument("--dst", help="Path to encoded *.dst file", required=True)
    convert.add_argument("--dictionary", help="Path to load dictionary data from", default="dictionary.json")

    args = parser.parse_args()
    args.func(args)
    return 0


if __name__ == "__main__":
    # Run the CLI only when executed as a script and hand main()'s return
    # value to the interpreter as the process exit status.
    exit_status = main()
    sys.exit(exit_status)
	#! /usr/bin/env python3

	"""
	An AutoCAD SheetSet file with the *.dst extension is just an XML stream encoded with a substitution cipher.
	The XML data is first encoded as UTF-8, so characters may have different lengths (from one to four bytes).
	It looks like the substitution needs to be done on a per-character level rather than on a per-byte level.

	== Restore conversion dictionary ==

	Given two files "Crack.xml" and "Crack.dst", we can restore conversion dictionary (limited to charset used in these files) as follows:

	python dst_format_ctl.py restore_dictionary --xml Crack.xml --dst Crack.dst --dictionary dictionary.json

	The dictionary is stored into "dictionary.json". It includes mappings for both directions - XML to DST and vice versa.

	== Convert *.dst to *.xml ==

	Given "sheet.dst" and "dictionary.json", we can produce "sheet.xml":

	python dst_format_ctl.py convert --direction dst_to_xml --dst sheet.dst --xml sheet.xml --dictionary dictionary.json

	== Convert *.xml to *.dst ==

	Given "sheet.xml" and "dictionary.json", we can produce "sheet.dst":

	python dst_format_ctl.py convert --direction xml_to_dst --dst sheet.dst --xml sheet.xml --dictionary dictionary.json
	"""

	import argparse
	import json
	import logging
	import sys


	def load_sequence(filename):
		"""Read a file and return its content as a list of byte values.

		Args:
			filename: Path of the file to read; it is opened in binary mode.

		Returns:
			A list of integers in the range 0..255, one per byte of the file,
			in file order. An empty file yields an empty list.
		"""
		# One buffered read instead of a read(1) call per byte; iterating a
		# bytes object yields the same integers that ord() would produce.
		with open(filename, "rb") as f:
			result = list(f.read())
		logging.info("Loaded sequence of %d byte(s) from %s", len(result), filename)
		return result


	def save_sequence(sequence, filename):
		"""Write a sequence of byte values to a file.

		Args:
			sequence: Iterable of integers in the range 0..255.
			filename: Path of the output file; it is created or truncated.

		Raises:
			ValueError: If an element is outside the range 0..255.
		"""
		# Build the payload first so invalid input fails before the target
		# file is truncated, then emit it with a single write.
		payload = bytes(sequence)
		with open(filename, "wb") as output:
			output.write(payload)
		logging.info("Sequence of %d byte(s) is saved to %s", len(payload), filename)


	def join_bytes(sequence, start, end):
		"""Pack the items at indices ``start`` .. ``end - 1`` into one integer.

		The items are combined big-endian: the item at ``start`` ends up in
		the most significant position and each following item is shifted in
		below it.

		Args:
			sequence: Indexable sequence of byte values.
			start: Index of the first item to pack.
			end: Index one past the last item to pack.

		Returns:
			The packed integer; 0 when the range is empty.

		Raises:
			Exception: If ``end`` lies beyond the end of ``sequence``.
		"""
		if end > len(sequence):
			raise Exception("sequence is too short to extract {} bytes of unicode character at {}".format(end - start, start))
		result = 0
		for i in range(start, end):
			# Plain bitwise OR; the stray backslash before the operator was
			# an escaping artifact and not valid Python.
			result = (result << 8) | sequence[i]
		return result


	def split_bytes(encoding, span):
		"""Unpack an integer into ``span`` byte values, most significant first.

		Only the lowest ``span`` bytes of ``encoding`` are produced; higher
		bits are dropped.

		Args:
			encoding: Integer to unpack.
			span: Number of byte values to produce.

		Returns:
			A reverse iterator (not a list) over the byte values.
		"""
		# Collect least-significant byte first, then hand back a reversed view.
		low_to_high = [(encoding >> (8 * offset)) & 0xFF for offset in range(span)]
		return reversed(low_to_high)


	def _utf8_span(lead_byte):
		"""Return how many bytes the character starting with ``lead_byte`` spans.

		The decision is made from the high bits of the byte only: 0xF0 and
		above give 4, 0xE0..0xEF give 3, 0xC0..0xDF give 2, anything else 1.
		"""
		if (lead_byte >> 4) == 0xF:
			return 4
		if (lead_byte >> 5) == 0x7:
			return 3
		if (lead_byte >> 6) == 0x3:
			return 2
		return 1


	def restore_dictionary_command(args):
		"""Rebuild the substitution dictionary from a matching XML/DST file pair.

		Both files are walked in lockstep, one character at a time. The
		character length is taken from the lead byte of the XML side and the
		same number of bytes is taken from the DST side. Each pair is recorded
		in both directions and the result is written as JSON with the keys
		"dst_to_xml" and "xml_to_dst".

		Args:
			args: Parsed command-line arguments providing ``xml``, ``dst``
				and ``dictionary`` paths.

		Raises:
			AssertionError: If the two files differ in length.
			Exception: If one character maps to two different counterparts,
				or if a character runs past the end of the data.
		"""
		xml = load_sequence(args.xml)
		dst = load_sequence(args.dst)
		if len(xml) != len(dst):
			# Raised explicitly rather than via `assert` so the check still
			# runs under `python -O`.
			raise AssertionError("both *.xml and *.dst files must be of the same length")

		cipher = {}
		decipher = {}
		n = len(xml)
		i = 0
		while i < n:
			b = xml[i]
			span = _utf8_span(b)
			logging.debug("i %d, n %d, b %d, span %d", i, n, b, span)
			cur = join_bytes(xml, i, i + span)
			encoding = join_bytes(dst, i, i + span)

			# setdefault() stores the pair on first sight and otherwise
			# returns the earlier mapping, which must agree with this one.
			known_encoding = cipher.setdefault(cur, encoding)
			if known_encoding != encoding:
				raise Exception("inconsistent encoding: {} was encoded as {} firstly and as {} afterwards".format(cur, known_encoding, encoding))
			known_decoding = decipher.setdefault(encoding, cur)
			if known_decoding != cur:
				raise Exception("inconsistent decoding: {} was decoded as {} firstly and as {} afterwards".format(encoding, known_decoding, cur))
			i += span

		logging.info("Detected encoding of %d characters", len(cipher))
		logging.info("Detected decoding of %d characters", len(decipher))

		logging.info("Saving dictionary data to %s", args.dictionary)
		# JSON object keys are always strings, so the integer keys are stored
		# as their decimal text.
		with open(args.dictionary, "w") as a_file:
			json.dump({
				"dst_to_xml": decipher,
				"xml_to_dst": cipher,
			}, a_file)


	def load_dictionary(path, direction):
		"""Load the mapping for one conversion direction from a JSON file.

		Args:
			path: Path of the JSON dictionary file.
			direction: Top-level key of the mapping to return.

		Returns:
			The object stored under ``direction`` in the JSON document.

		Raises:
			KeyError: If the document has no entry for ``direction``.
		"""
		with open(path) as a_file:
			data = json.load(a_file)
		result = data[direction]
		logging.info("Loaded dictionary of %d character(s) from %s for direction %s", len(result), path, direction)
		return result


	def convert_command(args):
		"""Convert a file between its DST and XML form using a saved dictionary.

		At every position the input is probed with character lengths of one to
		four bytes, shortest first; the first length whose packed value is
		found in the dictionary is used and replaced by the mapped value of
		the same length. If no length matches, an error is logged, conversion
		stops, and whatever was converted so far is still written out.

		Args:
			args: Parsed command-line arguments providing ``direction``
				("dst_to_xml" or "xml_to_dst"), ``dst``, ``xml`` and
				``dictionary`` paths.
		"""
		if args.direction == "dst_to_xml":
			input_path, output_path = args.dst, args.xml
		else:
			input_path, output_path = args.xml, args.dst
		sequence = load_sequence(input_path)
		dictionary = load_dictionary(args.dictionary, args.direction)

		n = len(sequence)
		i = 0

		result = []
		while i < n:
			# Never probe past the end of the input: join_bytes() raises
			# there, which would abort the command instead of taking the
			# "failed to decode" path below and saving the partial result.
			longest = min(4, n - i)
			for span in range(1, longest + 1):
				# Dictionary keys come from JSON and are therefore strings.
				cur = str(join_bytes(sequence, i, i + span))
				logging.debug("trying %s with span %d", cur, span)
				if cur in dictionary:
					result.extend(split_bytes(dictionary[cur], span))
					i += span
					break
			else:
				# No candidate length matched at this position.
				logging.error("failed to decode character sequence at %d, stopping here", i)
				break
		save_sequence(result, output_path)


	def main():
		"""Parse the command line and run the selected sub-command.

		Two sub-commands are available: "restore_dictionary" and "convert".
		Logging is configured at INFO level before anything else runs.

		Returns:
			0 after the selected sub-command has returned.
		"""
		logging.basicConfig(level=logging.INFO)
		parser = argparse.ArgumentParser()
		# A sub-command is mandatory: without one there is no `func` attribute
		# on the parsed arguments and the dispatch below would fail.
		subparsers = parser.add_subparsers(help="Modes of operation", dest="command")
		subparsers.required = True

		restore_dictionary = subparsers.add_parser("restore_dictionary")
		restore_dictionary.set_defaults(func=restore_dictionary_command)
		restore_dictionary.add_argument("--xml", help="Path to original *.xml file", required=True)
		restore_dictionary.add_argument("--dst", help="Path to encoded *.dst file", required=True)
		restore_dictionary.add_argument("--dictionary", help="Path to save dictionary data", default="dictionary.json")

		convert = subparsers.add_parser("convert")
		convert.set_defaults(func=convert_command)
		# The direction selects both the input file and the dictionary
		# section, so it cannot be left out.
		convert.add_argument("--direction", help="Direction of conversion", choices=["xml_to_dst", "dst_to_xml"], required=True)
		convert.add_argument("--xml", help="Path to original *.xml file", required=True)
		convert.add_argument("--dst", help="Path to encoded *.dst file", required=True)
		convert.add_argument("--dictionary", help="Path to load dictionary data from", default="dictionary.json")

		args = parser.parse_args()
		args.func(args)
		return 0


	if __name__ == "__main__":
		# Run the CLI only when executed as a script; main()'s return value
		# becomes the process exit status.
		sys.exit(main())