theiostream/idealist-to-csv.py

## idealist-to-csv.py
#!/usr/bin/python

# converts Blackwell IdeaList .TEX file to CSV
# (c) 2019 Daniel Ferreira

from array import array
from datetime import date
import csv

# Kinds of field payload. The parser picks one from the two-byte marker at
# the start of each field spec.
FIELD_TYPE_STRING = 0
FIELD_TYPE_DATE = 1

# States of the byte-walking parser loop:
#   STATE_START              - scanning for the start of the next record
#   STATE_READING_FIELD_SPEC - positioned on a field's type marker and length
#   STATE_READING_FIELD_DATA - positioned on a field's payload
STATE_START = 0
STATE_READING_FIELD_SPEC = 1
STATE_READING_FIELD_DATA = 2

# Output column name for each field, keyed by the lowercase hex rendering of
# the two bytes that immediately follow the field's payload in the file.
NICE_KEY = {
    '8506': 'name',
    '0b7f': 'title',
    'c903': 'christname',
    '2103': 'date',
    'd619': 'type',
    'ab3d': 'subject',
    '0203': 'bill_name',
    '0433': 'hoppit_subj',
    '840c': 'question',
    'a807': 'procon',
    '0000': 'other'
}

def round_down(n, div):
    """Return ``n`` with its remainder modulo ``div`` removed.

    For a positive ``div`` this is the largest multiple of ``div`` that does
    not exceed ``n``.
    """
    remainder = n % div
    return n - remainder

# The date format here is weird.
def parse_date(dd):
    """Decode a packed date field into a ``YYYY-MM-DD`` string.

    Only ``dd[2]`` and ``dd[3]`` are consulted; ``dd[0]`` and ``dd[1]`` are
    ignored.  The two bytes are laid out as follows:

    * ``dd[3]`` bits 0-4: day of the month.
    * ``dd[3]`` bits 5-7: low three bits of the month.
    * ``dd[2]`` bit 0: high bit of the month (adds 8 when set).
    * ``dd[2]`` bits 1-7: year counter, where ``0x34`` corresponds to 1690
      and every step of two advances the year by one.

    Args:
        dd: Indexable sequence of at least four integers in ``range(256)``.

    Returns:
        The decoded date formatted with ``'%Y-%m-%d'``.

    Raises:
        IndexError: If ``dd`` has fewer than four elements.
        ValueError: If the decoded values do not form a real calendar date
            (for example a month or day of zero).
    """
    year_byte = dd[2]
    month_day_byte = dd[3]

    # Floor division keeps the arithmetic in integers; the odd low bit of
    # year_byte belongs to the month, not the year.
    year = 1690 + (year_byte - 0x34) // 2

    month = month_day_byte >> 5
    if year_byte & 1:
        month += 8

    day = month_day_byte & 0x1F

    return date(year, month, day).strftime('%Y-%m-%d')

# Read JOURNALS.TEX, walk it with a three-state parser that collects one dict
# per record, then write every record to output.csv.
with open('JOURNALS.TEX', 'rb') as f:
    # The whole file as a sequence of unsigned byte values.
    data = array('B', f.read())

final_len = len(data)

output = []     # completed records, one dict per CSV row
cum_obj = {}    # the record currently being accumulated

state = STATE_START
idx = 0

field_type = FIELD_TYPE_STRING
field_len = 0

while idx < final_len:
    if state == STATE_START:
        # A record header is recognised by the byte 0xab at offset 8.
        # When fewer than nine bytes remain no header can follow, so stop
        # instead of indexing past the end of the buffer.
        if idx + 8 >= final_len:
            break
        if data[idx+8] != 0xab:
            idx += 1
            continue

        # The first three header bytes, rendered as hex, identify the record.
        cum_obj['id'] = bytes(data[idx:idx+3]).hex()

        # The first field spec starts ten bytes after the header start.
        state = STATE_READING_FIELD_SPEC
        idx += 10

    elif state == STATE_READING_FIELD_SPEC:
        # A field spec is four bytes: a two-byte type marker, a one-byte
        # payload length, and a fourth byte that is skipped.
        if data[idx] == 0x57 and data[idx+1] == 0x20:
            field_type = FIELD_TYPE_STRING
        elif data[idx] == 0x44 and data[idx+1] == 0x01:
            field_type = FIELD_TYPE_DATE
        else:
            raise Exception('bad field type')

        field_len = data[idx+2]

        state = STATE_READING_FIELD_DATA
        idx += 4

    elif state == STATE_READING_FIELD_DATA:
        # The two bytes right after the payload select the column name.
        # An unknown pair raises KeyError.
        key = NICE_KEY[bytes(data[idx+field_len:idx+field_len+2]).hex()]

        if field_type == FIELD_TYPE_STRING:
            # Bytes that are not valid UTF-8 are dropped, not reported.
            cum_obj[key] = bytes(data[idx:idx+field_len]).decode('utf-8', errors = 'ignore')
        elif field_type == FIELD_TYPE_DATE:
            cum_obj[key] = parse_date(data[idx:idx+field_len])
        else:
            raise Exception('Bad field type')

        if data[idx+field_len+2] == 0x00:
            # A zero byte after the key ends the record; 13 further bytes
            # are skipped before scanning for the next header.
            output.append(cum_obj)
            cum_obj = {}

            state = STATE_START
            idx += field_len + 2 + 13
        else:
            # Otherwise the next field spec follows the key directly.
            state = STATE_READING_FIELD_SPEC
            idx += field_len + 2

if not output:
    raise SystemExit('no records found in JOURNALS.TEX')

# Records need not all carry the same fields, so the header is the union of
# every record's keys in first-seen order.  DictWriter fills absent fields
# with an empty string.
keys = []
seen = set()
for record in output:
    for name in record:
        if name not in seen:
            seen.add(name)
            keys.append(name)

# newline='' lets the csv module control line endings itself.
with open('output.csv', 'w', newline='') as oupf:
    dict_writer = csv.DictWriter(oupf, keys)
    dict_writer.writeheader()
    dict_writer.writerows(output)
	#!/usr/bin/python

	# converts Blackwell IdeaList .TEX file to CSV
	# (c) 2019 Daniel Ferreira

	from array import array
	from datetime import date
	import csv

	# Kinds of field payload. The parser picks one from the two-byte marker at
	# the start of each field spec.
	FIELD_TYPE_STRING = 0
	FIELD_TYPE_DATE = 1

	# States of the byte-walking parser loop:
	#   STATE_START              - scanning for the start of the next record
	#   STATE_READING_FIELD_SPEC - positioned on a field's type marker and length
	#   STATE_READING_FIELD_DATA - positioned on a field's payload
	STATE_START = 0
	STATE_READING_FIELD_SPEC = 1
	STATE_READING_FIELD_DATA = 2

	# Output column name for each field, keyed by the lowercase hex rendering of
	# the two bytes that immediately follow the field's payload in the file.
	NICE_KEY = {
	'8506': 'name',
	'0b7f': 'title',
	'c903': 'christname',
	'2103': 'date',
	'd619': 'type',
	'ab3d': 'subject',
	'0203': 'bill_name',
	'0433': 'hoppit_subj',
	'840c': 'question',
	'a807': 'procon',
	'0000': 'other'
	}

	def round_down(n, div):
		"""Return ``n`` with its remainder modulo ``div`` removed.

		For a positive ``div`` this is the largest multiple of ``div`` that
		does not exceed ``n``.
		"""
		return n - (n % div)

	# The date format here is weird.
	def parse_date(dd):
		"""Decode a packed date field into a ``YYYY-MM-DD`` string.

		Only ``dd[2]`` and ``dd[3]`` are consulted; ``dd[0]`` and ``dd[1]``
		are ignored.  The two bytes are laid out as follows:

		* ``dd[3]`` bits 0-4: day of the month.
		* ``dd[3]`` bits 5-7: low three bits of the month.
		* ``dd[2]`` bit 0: high bit of the month (adds 8 when set).
		* ``dd[2]`` bits 1-7: year counter, where ``0x34`` corresponds to
		  1690 and every step of two advances the year by one.

		Args:
			dd: Indexable sequence of at least four integers in
				``range(256)``.

		Returns:
			The decoded date formatted with ``'%Y-%m-%d'``.

		Raises:
			IndexError: If ``dd`` has fewer than four elements.
			ValueError: If the decoded values do not form a real calendar
				date (for example a month or day of zero).
		"""
		year_byte = dd[2]
		month_day_byte = dd[3]

		# Floor division keeps the arithmetic in integers; the odd low bit
		# of year_byte belongs to the month, not the year.
		year = 1690 + (year_byte - 0x34) // 2

		month = month_day_byte >> 5
		if year_byte & 1:
			month += 8

		day = month_day_byte & 0x1F

		return date(year, month, day).strftime('%Y-%m-%d')

	# Read JOURNALS.TEX, walk it with a three-state parser that collects one
	# dict per record, then write every record to output.csv.
	with open('JOURNALS.TEX', 'rb') as f:
		# The whole file as a sequence of unsigned byte values.
		data = array('B', f.read())

	final_len = len(data)

	output = []     # completed records, one dict per CSV row
	cum_obj = {}    # the record currently being accumulated

	state = STATE_START
	idx = 0

	field_type = FIELD_TYPE_STRING
	field_len = 0

	while idx < final_len:
		if state == STATE_START:
			# A record header is recognised by the byte 0xab at offset 8.
			# When fewer than nine bytes remain no header can follow, so
			# stop instead of indexing past the end of the buffer.
			if idx + 8 >= final_len:
				break
			if data[idx+8] != 0xab:
				idx += 1
				continue

			# The first three header bytes, rendered as hex, identify the
			# record.
			cum_obj['id'] = bytes(data[idx:idx+3]).hex()

			# The first field spec starts ten bytes after the header start.
			state = STATE_READING_FIELD_SPEC
			idx += 10

		elif state == STATE_READING_FIELD_SPEC:
			# A field spec is four bytes: a two-byte type marker, a one-byte
			# payload length, and a fourth byte that is skipped.
			if data[idx] == 0x57 and data[idx+1] == 0x20:
				field_type = FIELD_TYPE_STRING
			elif data[idx] == 0x44 and data[idx+1] == 0x01:
				field_type = FIELD_TYPE_DATE
			else:
				raise Exception('bad field type')

			field_len = data[idx+2]

			state = STATE_READING_FIELD_DATA
			idx += 4

		elif state == STATE_READING_FIELD_DATA:
			# The two bytes right after the payload select the column name.
			# An unknown pair raises KeyError.
			key = NICE_KEY[bytes(data[idx+field_len:idx+field_len+2]).hex()]

			if field_type == FIELD_TYPE_STRING:
				# Bytes that are not valid UTF-8 are dropped, not reported.
				cum_obj[key] = bytes(data[idx:idx+field_len]).decode('utf-8', errors = 'ignore')
			elif field_type == FIELD_TYPE_DATE:
				cum_obj[key] = parse_date(data[idx:idx+field_len])
			else:
				raise Exception('Bad field type')

			if data[idx+field_len+2] == 0x00:
				# A zero byte after the key ends the record; 13 further
				# bytes are skipped before scanning for the next header.
				output.append(cum_obj)
				cum_obj = {}

				state = STATE_START
				idx += field_len + 2 + 13
			else:
				# Otherwise the next field spec follows the key directly.
				state = STATE_READING_FIELD_SPEC
				idx += field_len + 2

	if not output:
		raise SystemExit('no records found in JOURNALS.TEX')

	# Records need not all carry the same fields, so the header is the union
	# of every record's keys in first-seen order.  DictWriter fills absent
	# fields with an empty string.
	keys = []
	seen = set()
	for record in output:
		for name in record:
			if name not in seen:
				seen.add(name)
				keys.append(name)

	# newline='' lets the csv module control line endings itself.
	with open('output.csv', 'w', newline='') as oupf:
		dict_writer = csv.DictWriter(oupf, keys)
		dict_writer.writeheader()
		dict_writer.writerows(output)