Skip to content

Instantly share code, notes, and snippets.

@cds-amal
Created March 2, 2015 19:59
Show Gist options
  • Save cds-amal/c94c2e32be21f0a96bae to your computer and use it in GitHub Desktop.
Save cds-amal/c94c2e32be21f0a96bae to your computer and use it in GitHub Desktop.
CSV-to-JSON script for city of record data -- this is a starting point
import os.path
import json
import csv
from tidylib import tidy_document
import html2text as h2t
# Folder the input CSV files are read from.
dbfolder = '../CROL-PDF/Sample Database'

# One entry per input file:
#   (filename, field to clean -- or None to clean nothing, CSV delimiter)
files = [
    (
        'procPublicationRequest Oct-Dec 2014 (Updated) - Sheet1-2.csv',
        'AdditionalDescription',
        ',',
    ),
    (
        'procPublicationRequestDMSSPortal Oct-Dec 2014.csv',
        None,
        '|',
    ),
    (
        'procPublicationRequest_Oct-Dec_2014_clean.csv',
        'AdditionalDescription',
        ',',
    ),
    (
        'procPublicationRequest_pipes.csv',
        'AdditionalDescription',
        '|',
    ),
]
# Convert every CSV listed in `files` into a JSON file holding a list of row
# dicts.  Each CSV is read from `dbfolder`; the JSON is written to the current
# working directory under the same base name.  When an entry names a field to
# clean, that field's HTML is tidied and converted to plain text.
#
# The code is written to run on both Python 2 and Python 3.
_PY2 = str is bytes  # only true on Python 2, where csv yields byte strings

for fn, field, delimiter in files:
    # Swap only the extension; str.replace would also rewrite any '.csv'
    # appearing earlier in the file name.
    out = os.path.splitext(fn)[0] + '.json'
    fn = os.path.join(dbfolder, fn)
    items = []

    # The csv module wants binary mode on Python 2 and newline='' on
    # Python 3.  On Python 3 the decoding happens here, at the I/O boundary,
    # using a codepage that handles 0x91, 92, 93 etc.
    if _PY2:
        csvfile = open(fn, 'rb')
    else:
        csvfile = open(fn, newline='', encoding='cp1250')

    with csvfile:
        reader = csv.DictReader(csvfile, delimiter=delimiter)
        for row in reader:
            # Python 2 only: values arrive as byte strings and must be
            # decoded.  Non-string values (None for short rows, lists for
            # surplus columns) are left alone.  Iterate over a snapshot
            # because the row is updated in place.
            for k, v in list(row.items()):
                if isinstance(v, bytes):
                    row[k] = v.decode('cp1250')

            # Nothing to tidy when no field was requested, the column is
            # absent, or the cell has no value.
            raw = row.get(field) if field else None
            if raw is not None:
                try:
                    doc, _errors = tidy_document(raw)
                    row[field] = h2t.html2text(doc)
                except Exception as e:
                    # Best effort: report the failure, keep the original
                    # value and carry on with the remaining rows.
                    print('oopsie')
                    print(e)
                    print(raw)
                    print('')

            items.append(row)

    with open(out, 'w') as outfile:
        # indent=1 is what the former indent=True evaluated to.
        json.dump(items, outfile, indent=1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment