Skip to content

Instantly share code, notes, and snippets.

@fitnr
Last active March 14, 2018 16:09
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fitnr/5380778 to your computer and use it in GitHub Desktop.
Save fitnr/5380778 to your computer and use it in GitHub Desktop.
The New York City Street Name Dictionary is in an insane fixed-width format. This python script parses it into a CSV.
from csv import writer
from collections import OrderedDict
'''
The New York City Street Name Dictionary (SND) is distributed in an awkward
fixed-width format. This Python script parses it into two CSV files: one for
records flagged 'S' and one for all other records.
http://www.nyc.gov/html/dcp/html/bytes/applbyte.shtml
'''
# Column layout used for records whose indicator character is 'S'.
# Each entry is (field name, width in characters), in record order.
# Names starting with an underscore are filler: their columns are skipped
# when parsing and left out of the CSV header. Filler names differ only in
# underscore count because mapping keys must be unique.
FIELDS_S = OrderedDict((
    ('_', 1),
    ('borocode', 1),
    ('geo_feature', 32),
    ('__', 15),
    ('numeric_i', 1),
    ('geotype', 1),
    ('len_fullname', 2),
    ('progen_num', 1),

    # Progenitor 1.
    ('progen1_firstword', 1),
    ('progen1_geotype', 1),
    # B10SC is kept as one 8-character field. The alternative left commented
    # out by the author split it into borough (1), streetcode (5) and
    # localgroup (2).
    ('progen1_b10sc', 8),
    ('progen1_spellvar', 3),
    ('progen1_horiz_typology_flag', 1),
    ('___', 2),

    # Progenitor 2 (same sub-structure as progenitor 1).
    ('progen2_firstword', 1),
    ('progen2_geotype', 1),
    ('progen2_b10sc', 8),
    ('progen2_spellvar', 3),
    # NOTE(review): width is 11 here but 1 for progen1_horiz_typology_flag;
    # confirm against the SND record layout before relying on this field.
    ('progen2_horiz_typology_flag', 11),
    ('____', 2),
))
# Column layout used for every record whose indicator character is not 'S'.
# Each entry is (field name, width in characters), in record order.
# Names starting with an underscore are filler: their columns are skipped
# when parsing and left out of the CSV header. Filler names differ only in
# underscore count because mapping keys must be unique.
FIELDS_NON_S = OrderedDict((
    ('_', 1),
    ('borocode', 1),
    ('geo_feature', 32),
    ('primary_street_i', 1),
    ('local_group_i', 1),
    # B10SC is kept as one 8-character field. The alternative left commented
    # out by the author split it into borough (1), streetcode (5) and
    # localgroup (2).
    ('b10sc', 8),
    ('spellvar', 3),
    ('__', 2),
    ('numeric_i', 1),
    ('geotype', 1),
    ('______', 2),
    ('_____', 32),
    ('___', 2),
    ('____', 20),
    ('horiz_typology_code', 1),
))
# Input: the fixed-width Street Name Dictionary file to read.
COWFILE = 'snd13Acow.txt'

# Zero-based position of the character that selects a record's layout:
# 'S' means FIELDS_S, anything else means FIELDS_NON_S.
INDICATOR = 50

# Outputs: one CSV per record layout.
OUTFILE_S = 'parsed_cow_s.csv'
OUTFILE_NON_S = 'parsed_cow_non_s.csv'
def parse_line(line, fields):
    """Cut one fixed-width record into a list of field values.

    Args:
        line: The raw record.
        fields: Mapping of field name to column width, iterated in record
            order. Entries whose name starts with an underscore consume
            their columns but produce no output value.

    Returns:
        The whitespace-stripped values of the named fields, in order. A
        record shorter than the layout yields empty strings for the columns
        it does not reach.
    """
    values = []
    start = 0
    for name, width in fields.items():
        end = start + width
        if name[0] != '_':
            values.append(line[start:end].strip())
        # Filler and real fields alike advance the cursor.
        start = end
    return values
def write_file(filename, headers, writelist):
    """Write parsed rows to ``filename`` as CSV, preceded by a header row.

    Runs on both Python 2 and Python 3.

    Args:
        filename: Path of the CSV file to create (overwritten if present).
        headers: Iterable of field names. Names starting with an underscore
            are filler and are left out of the header row.
        writelist: Iterable of rows, each an iterable of values.
    """
    visible_headers = [name for name in headers if name[0] != '_']

    # The csv module needs a binary-mode file on Python 2 but a text-mode
    # file opened with newline='' on Python 3. `bytes is str` is true only
    # on Python 2.
    if bytes is str:
        handle = open(filename, 'wb')
    else:
        handle = open(filename, 'w', newline='')

    with handle as g:
        w = writer(g)
        w.writerow(visible_headers)
        w.writerows(writelist)

    # A single pre-formatted argument prints identically whether `print` is
    # the Python 2 statement or the Python 3 function.
    print('wrote to %s' % filename)
def readcow(filename):
    """Read the records of a fixed-width file, skipping its first line.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of text lines (line endings included), one per record,
        excluding the first line of the file.
    """
    # Binary mode keeps the bytes untouched, so column offsets stay exact.
    with open(filename, 'rb') as f:
        # Skip first line of file info junk
        f.readline()
        lines = f.readlines()

    # On Python 3 a binary read yields bytes, which never compare equal to
    # text such as 'S'. Decode there so callers get text on both versions.
    # `bytes is not str` is true only on Python 3.
    if bytes is not str:
        # Latin-1 maps every byte to exactly one character, so fixed-width
        # offsets are preserved.
        # NOTE(review): confirm the data file's actual encoding.
        lines = [raw.decode('latin-1') for raw in lines]
    return lines
def readlines(inputlines, indicator, fields_s, fields_non_s):
    """Parse raw records, routing each to the layout its indicator selects.

    Args:
        inputlines: Iterable of raw record lines.
        indicator: Index of the character that selects the layout.
        fields_s: Layout (name -> width) for records whose indicator
            character is 'S'.
        fields_non_s: Layout (name -> width) for all other records.

    Returns:
        A pair ``(rows_s, rows_non_s)`` of lists of parsed rows, each in
        input order.
    """
    outlines_s, outlines_non_s = [], []
    for line in inputlines:
        # Blank lines (e.g. a trailing empty line) carry no record.
        if not line.strip():
            continue
        # A line too short to reach the indicator column cannot be an 'S'
        # record; treat it as non-'S' instead of raising IndexError.
        try:
            flag = line[indicator]
        except IndexError:
            flag = ''
        if flag == 'S':
            outlines_s.append(parse_line(line, fields_s))
        else:
            outlines_non_s.append(parse_line(line, fields_non_s))
    return outlines_s, outlines_non_s
def main():
    """Convert COWFILE into two CSV files, one per record layout."""
    raw_lines = readcow(COWFILE)
    rows_s, rows_non_s = readlines(
        raw_lines, INDICATOR, FIELDS_S, FIELDS_NON_S)

    # (output path, layout, parsed rows), written in this order.
    outputs = (
        (OUTFILE_S, FIELDS_S, rows_s),
        (OUTFILE_NON_S, FIELDS_NON_S, rows_non_s),
    )
    for path, layout, rows in outputs:
        write_file(path, layout.keys(), rows)
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment