# fitnr/snd_parse.py

## snd_parse.py
from csv import writer
from collections import OrderedDict
'''
    The New York City Street Name Dictionary is in an insane fixed-width format.
    This python script parses it into a CSV.
    http://www.nyc.gov/html/dcp/html/bytes/applbyte.shtml
'''

# Fixed-width layout that main() hands to readlines() for lines whose
# indicator character is 'S': an ordered mapping of column name -> width in
# characters. Columns sit back to back from offset 0; the trailing comment
# on each entry is its 0-based offset range. Names starting with '_' are
# filler that parse_line() skips.
FIELDS_S = OrderedDict((
    ('_', 1),                             # 0
    ('borocode', 1),                      # 1
    ('geo_feature', 32),                  # 2-33

    ('__', 15),                           # 34-48
    ('numeric_i', 1),                     # 49
    ('geotype', 1),                       # 50
    ('len_fullname', 2),                  # 51-52
    ('progen_num', 1),                    # 53
    ('progen1_firstword', 1),             # 54
    ('progen1_geotype', 1),               # 55

    # Progenitor 1 B10SC, kept as a single 8-wide column. It could be split
    # into borough (1) + streetcode (5) + localgroup (2).
    ('progen1_b10sc', 8),                 # 56-63
    ('progen1_spellvar', 3),              # 64-66

    ('progen1_horiz_typology_flag', 1),   # 67

    ('___', 2),                           # 68-69
    ('progen2_firstword', 1),             # 70
    ('progen2_geotype', 1),               # 71

    # Progenitor 2 B10SC, same 1 + 5 + 2 composition as progenitor 1.
    ('progen2_b10sc', 8),                 # 72-79
    ('progen2_spellvar', 3),              # 80-82

    # NOTE(review): progenitor 1's flag is 1 wide but this one is 11;
    # confirm against the official record layout.
    ('progen2_horiz_typology_flag', 11),  # 83-93
    ('____', 2),                          # 94-95
))

# Fixed-width layout that main() hands to readlines() for every line whose
# indicator character is not 'S': an ordered mapping of column name -> width
# in characters. Columns sit back to back from offset 0; the trailing
# comment on each entry is its 0-based offset range. Names starting with
# '_' are filler that parse_line() skips.
FIELDS_NON_S = OrderedDict((
    ('_', 1),                     # 0
    ('borocode', 1),              # 1
    ('geo_feature', 32),          # 2-33
    ('primary_street_i', 1),      # 34
    ('local_group_i', 1),         # 35

    # B10SC kept as a single 8-wide column. It could be split into
    # borough (1) + streetcode (5) + localgroup (2).
    ('b10sc', 8),                 # 36-43
    ('spellvar', 3),              # 44-46

    ('__', 2),                    # 47-48

    ('numeric_i', 1),             # 49
    ('geotype', 1),               # 50
    ('______', 2),                # 51-52
    ('_____', 32),                # 53-84

    ('___', 2),                   # 85-86
    ('____', 20),                 # 87-106
    ('horiz_typology_code', 1),   # 107
))

# 0-based index of the character that readlines() compares with 'S' to pick
# a layout. Offset 50 is the 'geotype' column in both FIELDS_S and
# FIELDS_NON_S.
INDICATOR = 50

# CSV files written by main(), one per layout.
OUTFILE_S = 'parsed_cow_s.csv'
OUTFILE_NON_S = 'parsed_cow_non_s.csv'

# Fixed-width input file read by main().
COWFILE = 'snd13Acow.txt'


def parse_line(line, fields):
    """Slice one fixed-width record into its named column values.

    Args:
        line: A single record. It is sliced by position, so it must support
            slicing and ``strip()``.
        fields: Ordered mapping of column name -> width. Columns are laid
            out back to back from offset 0, in mapping order.

    Returns:
        A list holding the whitespace-stripped value of every column whose
        name does not start with ``'_'``, in mapping order. A record shorter
        than the layout yields truncated or empty strings for the columns
        it does not reach.
    """
    values = []
    start = 0
    for name, width in fields.items():
        end = start + width
        # Columns named with a leading '_' are filler: they advance the
        # offset but add nothing to the output row.
        if name[0] != '_':
            values.append(line[start:end].strip())
        start = end
    return values


def write_file(filename, headers, writelist):
    """Write a header row followed by data rows to a CSV file.

    Works on both Python 2 and Python 3; the original used a ``print``
    statement and a binary-mode file, both of which fail on Python 3.

    Args:
        filename: Path of the CSV file to create (overwritten if present).
        headers: Iterable of column names. Names starting with ``'_'`` are
            left out of the header row, mirroring the columns that
            parse_line() omits from each data row.
        writelist: Iterable of rows, each a sequence of cell values.
    """
    headers = [x for x in headers if x[0] != '_']
    # The csv module needs a binary file on Python 2 but a text file opened
    # with newline='' on Python 3. ``bytes is str`` is true only on Python 2.
    if bytes is str:
        g = open(filename, 'wb')
    else:
        g = open(filename, 'w', newline='')
    with g:
        w = writer(g)
        w.writerow(headers)
        w.writerows(writelist)
    # A single parenthesised string prints identically on Python 2 and 3.
    print('wrote to %s' % (filename,))


def readcow(filename):
    """Read every line of a file except the first.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of the lines after the first, with line endings intact.
        Lines are native ``str`` on both Python 2 and Python 3, so that
        comparisons against string literals such as ``'S'`` behave the
        same way on either version.
    """
    with open(filename, 'rb') as f:
        # Skip first line of file info junk
        f.readline()
        lines = f.readlines()
    if bytes is not str:
        # Python 3 only: binary reads give bytes. latin-1 maps every byte
        # to exactly one character, so it never fails and fixed-width
        # offsets stay valid.
        lines = [line.decode('latin-1') for line in lines]
    return lines


def readlines(inputlines, indicator, fields_s, fields_non_s):
    """Parse records into two lists, chosen by one indicator character.

    Args:
        inputlines: Iterable of fixed-width records.
        indicator: 0-based index of the character that selects the layout.
        fields_s: Layout (ordered name -> width mapping) applied when the
            indicator character equals ``'S'``.
        fields_non_s: Layout applied to every other record.

    Returns:
        A ``(rows_s, rows_non_s)`` tuple of lists of parsed rows, each in
        input order.

    Raises:
        IndexError: If a non-blank record is too short to contain the
            indicator position.
    """
    outlines_s, outlines_non_s = [], []

    for line in inputlines:
        # Blank lines (e.g. a trailing newline at end of file) hold no
        # record; indexing them at ``indicator`` would raise IndexError.
        if not line.strip():
            continue
        if line[indicator] == 'S':
            outlines_s.append(parse_line(line, fields_s))
        else:
            outlines_non_s.append(parse_line(line, fields_non_s))
    return outlines_s, outlines_non_s


def main():
    """Read COWFILE, parse it with both layouts, and write the two CSVs."""
    raw_lines = readcow(COWFILE)
    rows_s, rows_non_s = readlines(
        raw_lines, INDICATOR, FIELDS_S, FIELDS_NON_S)

    # Write the 'S' file first, then the non-'S' file.
    outputs = (
        (OUTFILE_S, FIELDS_S, rows_s),
        (OUTFILE_NON_S, FIELDS_NON_S, rows_non_s),
    )
    for path, layout, rows in outputs:
        write_file(path, layout.keys(), rows)

# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
	from csv import writer
	from collections import OrderedDict
	'''
	The New York City Street Name Dictionary is in an insane fixed-width format.
	This python script parses it into a CSV.
	http://www.nyc.gov/html/dcp/html/bytes/applbyte.shtml
	'''

	# Fixed-width layout passed to readlines() for lines whose indicator
	# character is 'S': an ordered mapping of column name -> width in
	# characters. Columns sit back to back from offset 0; the trailing comment
	# on each entry is its 0-based offset range. Names starting with '_' are
	# filler that parse_line() skips.
	FIELDS_S = OrderedDict((
	    ('_', 1),                             # 0
	    ('borocode', 1),                      # 1
	    ('geo_feature', 32),                  # 2-33

	    ('__', 15),                           # 34-48
	    ('numeric_i', 1),                     # 49
	    ('geotype', 1),                       # 50
	    ('len_fullname', 2),                  # 51-52
	    ('progen_num', 1),                    # 53
	    ('progen1_firstword', 1),             # 54
	    ('progen1_geotype', 1),               # 55

	    # Progenitor 1 B10SC, kept as a single 8-wide column. It could be
	    # split into borough (1) + streetcode (5) + localgroup (2).
	    ('progen1_b10sc', 8),                 # 56-63
	    ('progen1_spellvar', 3),              # 64-66

	    ('progen1_horiz_typology_flag', 1),   # 67

	    ('___', 2),                           # 68-69
	    ('progen2_firstword', 1),             # 70
	    ('progen2_geotype', 1),               # 71

	    # Progenitor 2 B10SC, same 1 + 5 + 2 composition as progenitor 1.
	    ('progen2_b10sc', 8),                 # 72-79
	    ('progen2_spellvar', 3),              # 80-82

	    # NOTE(review): progenitor 1's flag is 1 wide but this one is 11;
	    # confirm against the official record layout.
	    ('progen2_horiz_typology_flag', 11),  # 83-93
	    ('____', 2),                          # 94-95
	))

	# Fixed-width layout passed to readlines() for every line whose indicator
	# character is not 'S': an ordered mapping of column name -> width in
	# characters. Columns sit back to back from offset 0; the trailing comment
	# on each entry is its 0-based offset range. Names starting with '_' are
	# filler that parse_line() skips.
	FIELDS_NON_S = OrderedDict((
	    ('_', 1),                     # 0
	    ('borocode', 1),              # 1
	    ('geo_feature', 32),          # 2-33
	    ('primary_street_i', 1),      # 34
	    ('local_group_i', 1),         # 35

	    # B10SC kept as a single 8-wide column. It could be split into
	    # borough (1) + streetcode (5) + localgroup (2).
	    ('b10sc', 8),                 # 36-43
	    ('spellvar', 3),              # 44-46

	    ('__', 2),                    # 47-48

	    ('numeric_i', 1),             # 49
	    ('geotype', 1),               # 50
	    ('______', 2),                # 51-52
	    ('_____', 32),                # 53-84

	    ('___', 2),                   # 85-86
	    ('____', 20),                 # 87-106
	    ('horiz_typology_code', 1),   # 107
	))

	# 0-based index of the character that readlines() compares with 'S' to
	# pick a layout. Offset 50 is the 'geotype' column in both FIELDS_S and
	# FIELDS_NON_S.
	INDICATOR = 50

	# CSV files written by main(), one per layout.
	OUTFILE_S = 'parsed_cow_s.csv'
	OUTFILE_NON_S = 'parsed_cow_non_s.csv'

	# Fixed-width input file read by main().
	COWFILE = 'snd13Acow.txt'


	def parse_line(line, fields):
	    """Slice one fixed-width record into its named column values.

	    Args:
	        line: A single record. It is sliced by position, so it must
	            support slicing and ``strip()``.
	        fields: Ordered mapping of column name -> width. Columns are laid
	            out back to back from offset 0, in mapping order.

	    Returns:
	        A list holding the whitespace-stripped value of every column
	        whose name does not start with ``'_'``, in mapping order.
	    """
	    values = []
	    start = 0
	    for name, width in fields.items():
	        end = start + width
	        # Columns named with a leading '_' are filler: they advance the
	        # offset but add nothing to the output row.
	        if name[0] != '_':
	            values.append(line[start:end].strip())
	        start = end
	    return values


	def write_file(filename, headers, writelist):
	    """Write a header row followed by data rows to a CSV file.

	    Works on both Python 2 and Python 3.

	    Args:
	        filename: Path of the CSV file to create (overwritten if present).
	        headers: Iterable of column names. Names starting with ``'_'``
	            are left out of the header row.
	        writelist: Iterable of rows, each a sequence of cell values.
	    """
	    headers = [x for x in headers if x[0] != '_']
	    # The csv module needs a binary file on Python 2 but a text file
	    # opened with newline='' on Python 3. ``bytes is str`` is true only
	    # on Python 2.
	    if bytes is str:
	        g = open(filename, 'wb')
	    else:
	        g = open(filename, 'w', newline='')
	    with g:
	        w = writer(g)
	        w.writerow(headers)
	        w.writerows(writelist)
	    # A single parenthesised string prints identically on Python 2 and 3.
	    print('wrote to %s' % (filename,))


	def readcow(filename):
	    """Read every line of a file except the first.

	    Args:
	        filename: Path of the file to read.

	    Returns:
	        A list of the lines after the first, with line endings intact.
	        Lines are native ``str`` on both Python 2 and Python 3.
	    """
	    with open(filename, 'rb') as f:
	        # Skip first line of file info junk
	        f.readline()
	        lines = f.readlines()
	    if bytes is not str:
	        # Python 3 only: binary reads give bytes. latin-1 maps every byte
	        # to exactly one character, so it never fails and fixed-width
	        # offsets stay valid.
	        lines = [line.decode('latin-1') for line in lines]
	    return lines


	def readlines(inputlines, indicator, fields_s, fields_non_s):
	    """Parse records into two lists, chosen by one indicator character.

	    Args:
	        inputlines: Iterable of fixed-width records.
	        indicator: 0-based index of the character that selects the layout.
	        fields_s: Layout (ordered name -> width mapping) applied when the
	            indicator character equals ``'S'``.
	        fields_non_s: Layout applied to every other record.

	    Returns:
	        A ``(rows_s, rows_non_s)`` tuple of lists of parsed rows, each in
	        input order.

	    Raises:
	        IndexError: If a non-blank record is too short to contain the
	            indicator position.
	    """
	    outlines_s, outlines_non_s = [], []

	    for line in inputlines:
	        # Blank lines (e.g. a trailing newline at end of file) hold no
	        # record; indexing them at ``indicator`` would raise IndexError.
	        if not line.strip():
	            continue
	        if line[indicator] == 'S':
	            outlines_s.append(parse_line(line, fields_s))
	        else:
	            outlines_non_s.append(parse_line(line, fields_non_s))
	    return outlines_s, outlines_non_s


	def main():
	    """Read COWFILE, parse it with both layouts, and write the two CSVs."""
	    lines = readcow(COWFILE)
	    lines_s, lines_non_s = readlines(
	        lines, INDICATOR, FIELDS_S, FIELDS_NON_S)

	    # Write the 'S' file first, then the non-'S' file.
	    write_file(OUTFILE_S, FIELDS_S.keys(), lines_s)
	    write_file(OUTFILE_NON_S, FIELDS_NON_S.keys(), lines_non_s)

	# Run the conversion only when executed as a script, not when imported.
	if __name__ == '__main__':
	    main()