epoz/STCN raw parser

## STCN raw parser
'''
Read in an STCN data dump file (named by the first command-line argument) and
convert it to a tab-delimited CSV file, written next to the input as <input>.csv.
The data looks something like this:


SET: S0 [10000] TTL: 5        PPN: 339722142                            PAG: 1 .

Ingevoerd: 1996:31-01-12 Gewijzigd: 1996:07-02-12 09:12:25 Status: 1996:31-01-12

0500 Aav
1100 168X
1200 y
1200 i
1500 /1ned/1fra/1lat
1700 /1nl
2275 000004 - b1 A on : b2 A2 re$c
4000 @Verhaal van een droom die een verstandig staatkundige gehadt heeft
4040 [c. 1688]
4043 !075566745!S.l.s.n. Place and name not stated
4060 A`SUP`2`LO`
4062 4<EA>
4201 `IT`Text in Dutch, French and Latin`LO`
4711 A (lm)
4900 31-01-12 11:39:52.000
6511 !155446010!History (Europe)
6512 !155445642!Period documents
7001 31-01-12 : s.a
7100 O 06-398 !A! @ i
7134 #Illustration#jpeg =A http://resolver.kb.nl/resolve?urn=stcn:339722142:01
7900 31-01-12 11:40:00.000
7800 575961856

'''

import sys
import csv

# Run of spaces that separates the PPN from "PAG:" on a record's header line,
# e.g. "PPN: 339722142<28 spaces>PAG: 1 .".
PPN_PADDING = ' ' * 28


def parse_records(raw_data):
    """Parse the text of an STCN dump into ``{ppn: {code: [values, ...]}}``.

    The dump is cut at every ``'PPN: '`` marker.  Within each piece, the text
    before the first run of padding spaces is the PPN; every following line
    of the form ``"<integer code> <content>"`` is stored under its integer
    code.  A code may repeat inside one record, so the values are collected
    in a list in file order.  Lines whose first token is not an integer
    (``PAG:``, ``Ingevoerd:``, the next record's ``SET:`` line, blank lines)
    are ignored, and records without any coded line are dropped.

    Both ``\\r\\n`` and ``\\n`` line endings are accepted, so the text may come
    from a file opened with or without newline translation.
    """
    data = {}
    for chunk in raw_data.split('PPN: '):
        # Cut at the FIRST padding run only: a record whose body happens to
        # contain the same run of spaces is kept instead of silently dropped.
        ppn, padding, body = chunk.partition(PPN_PADDING)
        if not padding:
            # Text before the first marker, or a header without the padding.
            continue
        record = {}
        for line in body.replace('\r\n', '\n').split('\n'):
            code, space, content = line.partition(' ')
            if not space:
                continue
            try:
                code = int(code)
            except ValueError:
                # Not a field line.
                continue
            record.setdefault(code, []).append(content.strip('\r\n '))
        if record:
            data[ppn] = record
    return data


def build_rows(data):
    """Return one list of cell strings per record in ``data``.

    The columns are every field code seen in any record, in ascending
    numeric order.  Repeated values of a field are joined with a single
    space; a field that a record lacks becomes an empty cell.  Neither a
    header row nor the PPN is emitted.
    """
    fields = sorted({code for record in data.values() for code in record})
    return [
        [' '.join(record.get(code, [])) for code in fields]
        for record in data.values()
    ]


def convert(path, encoding=None):
    """Convert the dump at ``path`` into ``<path>.csv`` (tab-delimited).

    ``encoding`` is used for both reading and writing; ``None`` selects the
    platform default.
    NOTE(review): the encoding of real STCN dumps is not visible from this
    script -- pass it explicitly if the default fails to decode the file.
    """
    with open(path, encoding=encoding) as dump:
        data = parse_records(dump.read())

    # The csv module needs a text-mode file opened with newline='' so that
    # it controls the row terminator itself.
    with open('%s.csv' % path, 'w', newline='', encoding=encoding) as csvfile:
        writer = csv.writer(csvfile, delimiter='\t',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerows(build_rows(data))


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stderr.write('usage: python <script> <stcn-dump-file>\n')
        return 2
    convert(argv[1])
    return 0


if __name__ == '__main__':
    sys.exit(main())
	'''
	Read in an STCN data dump file (named by the first command-line argument) and
	convert it to a tab-delimited CSV file, written next to the input as <input>.csv.
	The data looks something like this:


	SET: S0 [10000] TTL: 5 PPN: 339722142 PAG: 1 .

	Ingevoerd: 1996:31-01-12 Gewijzigd: 1996:07-02-12 09:12:25 Status: 1996:31-01-12

	0500 Aav
	1100 168X
	1200 y
	1200 i
	1500 /1ned/1fra/1lat
	1700 /1nl
	2275 000004 - b1 A on : b2 A2 re$c
	4000 @Verhaal van een droom die een verstandig staatkundige gehadt heeft
	4040 [c. 1688]
	4043 !075566745!S.l.s.n. Place and name not stated
	4060 A`SUP`2`LO`
	4062 4<EA>
	4201 `IT`Text in Dutch, French and Latin`LO`
	4711 A (lm)
	4900 31-01-12 11:39:52.000
	6511 !155446010!History (Europe)
	6512 !155445642!Period documents
	7001 31-01-12 : s.a
	7100 O 06-398 !A! @ i
	7134 #Illustration#jpeg =A http://resolver.kb.nl/resolve?urn=stcn:339722142:01
	7900 31-01-12 11:40:00.000
	7800 575961856

	'''

	import sys
	import csv

	# Run of spaces that separates the PPN from "PAG:" on a record's header
	# line, e.g. "PPN: 339722142<28 spaces>PAG: 1 .".  This copy of the script
	# had the run collapsed to a single space; the full width is restored.
	PPN_PADDING = ' ' * 28


	def parse_records(raw_data):
	    """Parse the text of an STCN dump into ``{ppn: {code: [values, ...]}}``.

	    The dump is cut at every ``'PPN: '`` marker.  Within each piece, the
	    text before the first run of padding spaces is the PPN; every
	    following line of the form ``"<integer code> <content>"`` is stored
	    under its integer code.  A code may repeat inside one record, so the
	    values are collected in a list in file order.  Lines whose first token
	    is not an integer are ignored, and records without any coded line are
	    dropped.

	    Both ``\\r\\n`` and ``\\n`` line endings are accepted.
	    """
	    data = {}
	    for chunk in raw_data.split('PPN: '):
	        # Cut at the first padding run only, so a body that happens to
	        # contain the same run of spaces does not discard the record.
	        ppn, padding, body = chunk.partition(PPN_PADDING)
	        if not padding:
	            # Text before the first marker, or a header without padding.
	            continue
	        record = {}
	        for line in body.replace('\r\n', '\n').split('\n'):
	            code, space, content = line.partition(' ')
	            if not space:
	                continue
	            try:
	                code = int(code)
	            except ValueError:
	                # Not a field line.
	                continue
	            record.setdefault(code, []).append(content.strip('\r\n '))
	        if record:
	            data[ppn] = record
	    return data


	def build_rows(data):
	    """Return one list of cell strings per record in ``data``.

	    The columns are every field code seen in any record, in ascending
	    numeric order.  Repeated values of a field are joined with a single
	    space; a field that a record lacks becomes an empty cell.  Neither a
	    header row nor the PPN is emitted.
	    """
	    fields = sorted({code for record in data.values() for code in record})
	    return [
	        [' '.join(record.get(code, [])) for code in fields]
	        for record in data.values()
	    ]


	def convert(path, encoding=None):
	    """Convert the dump at ``path`` into ``<path>.csv`` (tab-delimited).

	    ``encoding`` is used for both reading and writing; ``None`` selects
	    the platform default.
	    NOTE(review): the encoding of real STCN dumps is not visible from this
	    script -- pass it explicitly if the default fails to decode the file.
	    """
	    with open(path, encoding=encoding) as dump:
	        data = parse_records(dump.read())

	    # The csv module needs a text-mode file opened with newline='' so that
	    # it controls the row terminator itself.  The quote character must be
	    # exactly one character: '|' (not the markdown-escaped '\|').
	    with open('%s.csv' % path, 'w', newline='', encoding=encoding) as csvfile:
	        writer = csv.writer(csvfile, delimiter='\t',
	                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
	        writer.writerows(build_rows(data))


	def main(argv=None):
	    """Command-line entry point; returns the process exit status."""
	    argv = sys.argv if argv is None else argv
	    if len(argv) < 2:
	        sys.stderr.write('usage: python <script> <stcn-dump-file>\n')
	        return 2
	    convert(argv[1])
	    return 0


	if __name__ == '__main__':
	    sys.exit(main())