Skip to content

Instantly share code, notes, and snippets.

@rossjones
Created January 20, 2017 15:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rossjones/c525f45bb0cdc935649da7cfcf9f82e7 to your computer and use it in GitHub Desktop.
Save rossjones/c525f45bb0cdc935649da7cfcf9f82e7 to your computer and use it in GitHub Desktop.
"""
Converts the VOA summary non-domestic rates data from a not-quite-CSV file
(fields separated by '*') into a single large JSON array.

Usage:
    python convert.py > summary.json < data/uk-englandwales-ndr-2017-summaryvaluations-proposed-epoch-0001-baseline-csv.csv
"""
import io
import json
import sys
def generate_record(row, titles):
    """Build a dict that pairs each cell of *row* with its column title.

    Cells are matched to titles by position. A row may be shorter than
    ``titles``; the unmatched trailing titles are then absent from the
    result.

    Args:
        row: Iterable of cell values for one input line.
        titles: Sequence of column names, in the same order as the cells.

    Returns:
        A dict mapping ``titles[i]`` to the i-th cell of *row*.

    Raises:
        IndexError: If *row* holds more cells than there are titles.
    """
    cells = list(row)
    if len(cells) > len(titles):
        # Same exception type a positional lookup past the end of ``titles``
        # raises, but with enough context to find the offending input line.
        raise IndexError(
            'row has {} cells but only {} titles are defined: {!r}'.format(
                len(cells), len(titles), cells
            )
        )
    return dict(zip(titles, cells))
# Column titles, in field order, for rows whose record type is '01'.
type01_titles = [
    'assessment_reference',
    'uarn',
    'ba_code',
    'firm_name',
    # Address fields
    'number_or_name',
    'sub_street_3',
    'sub_street_2',
    'sub_street_1',
    'street',
    'town',
    'postal_district',
    'county',
    'postcode',
    # Remaining fields
    'scheme_ref',
    'primary_description',
    'total_area',
    'subtotal',
    'total_value',
    'adopted_rv',
    'list_year',
    'ba_name',
    'ba_reference_number',
    'vo_ref',
    'from_date',
    'to_date',
    'scat_code_only',
    'unit_of_measurement',
    'unadjusted_price',
]
# Column titles, in field order, for the remaining record types. The number
# in each name is the record-type code found in the first field of a row.
type02_titles = ['line', 'floor', 'description', 'area', 'price', 'value']
# 'size' previously carried a stray leading space (' size'), which leaked
# into the emitted JSON as the key " size".
type03_titles = ['other_oa_description', 'size', 'price', 'value']
type04_titles = ['pm_value']
type05_titles = ['spaces', 'spaces_value', 'area', 'area_value', 'total']
type06_titles = ['description', 'percent']
type07_titles = ['total_before', 'total_adjustment']
# Dispatch table keyed by record-type code. Each value is a pair:
#   (key the parsed row is stored under in a record, column titles for it)
processors = {
    '01': ('details', type01_titles),
    '02': ('line_items', type02_titles),
    '03': ('additional', type03_titles),
    '04': ('plant_machinery', type04_titles),
    '05': ('carpark', type05_titles),
    '06': ('adjustments', type06_titles),
    '07': ('adjustment_totals', type07_titles),
}
def new_record():
    """Return a fresh, empty record skeleton.

    The list-valued keys collect rows that are appended one at a time; the
    keys initialised to ``None`` hold at most one row each. Every call
    returns new list objects, so records never share state.

    Returns:
        A dict with the keys ``line_items``, ``additional``, ``adjustments``
        (empty lists) and ``plant_machinery``, ``details``, ``carpark``,
        ``adjustment_totals`` (``None``).
    """
    return {
        # Repeating sections
        'line_items': [],
        'additional': [],
        'adjustments': [],
        # Single-valued sections
        'plant_machinery': None,
        'details': None,
        'carpark': None,
        'adjustment_totals': None,
    }
def process(row_feeder):
    """Group '*'-delimited input lines into records and yield them lazily.

    The first field of every line is a record-type code that is looked up in
    ``processors`` to find the record key and column titles for the rest of
    the line. A '01' line starts a new record; the lines that follow are
    attached to that record until the next '01' line.

    Args:
        row_feeder: Iterable of text lines (for example an open text file).

    Yields:
        One dict per record, shaped like the result of ``new_record()`` with
        the parsed rows filled in.

    Raises:
        KeyError: If a line starts with a record-type code that is not in
            ``processors``.
    """
    current_record = new_record()
    # Tracks whether any row was stored, so empty input yields nothing
    # instead of one all-empty record (a non-empty dict is always truthy).
    has_rows = False

    for row in row_feeder:
        line = row.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of input) carry no
            # record type; skip them rather than fail the lookup below.
            continue

        id_field, *cells = line.split('*')

        # Get the name of this row and titles to use
        name, titles = processors[id_field]

        # A '01' row opens a new record: hand over the one collected so far.
        # Compare against None so that a '01' row with no fields (an empty
        # details dict) still counts as having started a record.
        if id_field == '01' and current_record.get('details') is not None:
            yield current_record
            current_record = new_record()

        data = generate_record(cells, titles)
        if id_field in ('02', '03', '06'):
            # These record types can occur several times per record.
            current_record[name].append(data)
        else:
            current_record[name] = data
        has_rows = True

    if has_rows:
        yield current_record
if __name__ == '__main__':
    # Emit the records as one JSON array, writing each element as soon as
    # process() yields it rather than collecting them all first.
    print("[", end="")
    # Re-wrap stdin so the input is decoded as UTF-8; the 'utf-8-sig' codec
    # also drops a leading byte-order mark if the file has one.
    with io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8-sig') as f:
        for counter, record in enumerate(process(f)):
            if counter > 0:
                # Separator goes before every element except the first.
                print(",", end="")
            print(json.dumps(record), end="")
    print("]")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment