Skip to content

Instantly share code, notes, and snippets.

@raprasad
Last active November 28, 2018 18:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save raprasad/f7c373e14854edaacb9311e0bec8be5a to your computer and use it in GitHub Desktop.
Save raprasad/f7c373e14854edaacb9311e0bec8be5a to your computer and use it in GitHub Desktop.
Script to check that crime incidents exist
"""python3 script to check data files
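Verifies that the incident type code of every incident in the monthly data
files has a matching entry in the incident types file.

For use with the New Haven Crime Files on Dataverse:
- https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/18J4ZW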
"""
import json
from os.path import isfile
import sys
# --------------------------------------------
# DATA_DIRECTORY: directory that holds the downloaded data files.
# SET YOUR DATA DIRECTORY HERE. It must contain these files:
# - 02_incident_types.json (the incident type lookup)
# - monthly data files, named "incidents_<yyyy>_<mm>.json"
# - e.g. "incidents_2006_03.json", etc
# File names are appended to this value with a "/" separator, so a
# relative path is resolved against the current working directory.
# --------------------------------------------
DATA_DIRECTORY = 'dataverse_files'
def msgt(m):
    """Print message ``m`` framed above and below by a 40-dash rule."""
    rule = '-' * 40
    for line in (rule, m, rule):
        print(line)
# Module-level cache filled by get_incident_lookup() on its first successful
# load; an empty dict means "not loaded yet".
CODE_LOOKUP = {}


def get_incident_lookup():
    """Return the incident-type lookup, loading it from disk on first use.

    Reads ``02_incident_types.json`` from ``DATA_DIRECTORY`` and maps each
    entry's ``pk`` to its ``fields.incident_description``,
    e.g. {1: "MURDER", 2: "MISCONDUCT WITH A M/V"}

    The mapping is cached in the module-level ``CODE_LOOKUP``; once it is
    non-empty, later calls return it without touching the file again.

    Returns:
        dict mapping incident primary key to incident description.

    Raises:
        SystemExit: with status 1 when the incident types file is missing.
    """
    global CODE_LOOKUP
    if CODE_LOOKUP:
        return CODE_LOOKUP

    fname = '%s/02_incident_types.json' % DATA_DIRECTORY
    if not isfile(fname):
        print('Incidents file not found: %s' % fname)
        # A missing lookup file is a failure, so exit with a non-zero status
        # (the previous status 0 reported success to the calling shell).
        sys.exit(1)

    # The context manager closes the handle even if JSON parsing fails;
    # the explicit encoding avoids depending on the platform locale.
    with open(fname, 'r', encoding='utf-8') as fh:
        incident_code_info = json.load(fh)

    # --------------------------------------------
    # map incident primary key to incident description
    # --------------------------------------------
    CODE_LOOKUP = {
        info['pk']: info['fields']['incident_description']
        for info in incident_code_info
    }
    return CODE_LOOKUP
def check_file(yyyy, mm, cnt=None):
    """Check one monthly data file's incident codes against the lookup.

    Reads ``incidents_<yyyy>_<mm>.json`` from ``DATA_DIRECTORY`` and counts
    the incidents whose ``fields.incident_type`` is a key of the dict
    returned by ``get_incident_lookup()``. Prints a header, the
    ``found/total`` ratio and a verdict line.

    Args:
        yyyy: year part of the file name.
        mm: month part of the file name, used as given (the callers in this
            file pass a zero-padded string such as '01').
        cnt: optional counter shown in the header; omitted when None.

    Returns:
        True when every incident's code was found in the lookup, False when
        at least one was not or when the file does not exist.
    """
    monthly_data_file = '%s/incidents_%s_%s.json' % (DATA_DIRECTORY, yyyy, mm)

    # Compare against None so that a counter of 0 is still displayed.
    if cnt is not None:
        msgt('(%s) check file: %s' % (cnt, monthly_data_file))
    else:
        msgt('check file: %s' % monthly_data_file)

    if not isfile(monthly_data_file):
        print('file not found')
        return False

    code_lookup = get_incident_lookup()

    # --------------------------------------------
    # Load monthly data file (the context manager closes the handle)
    # --------------------------------------------
    with open(monthly_data_file, 'r', encoding='utf-8') as fh:
        monthly_data = json.load(fh)

    # --------------------------------------------
    # Count the incidents whose 'fields.incident_type'
    # has a corresponding key in the lookup dict.
    # --------------------------------------------
    cnt_found = sum(
        1 for crime_info in monthly_data
        if crime_info['fields']['incident_type'] in code_lookup
    )
    total = len(monthly_data)

    # --------------------------------------------
    # Print # of found codes vs total codes
    # --------------------------------------------
    print('%s/%s' % (cnt_found, total))
    all_found = cnt_found == total
    if all_found:
        print('Looks good!')
    else:
        print('Error???')
    return all_found
def check_data_files():
    """Run check_file on every month from January 2006 to December 2014.

    Months are passed as zero-padded two-character strings, and each call
    also receives a running file counter that starts at 1.
    """
    year_months = [
        (year, str(month).zfill(2))
        for year in range(2006, 2015)
        for month in range(1, 13)
    ]
    for fcnt, (year, mm) in enumerate(year_months, start=1):
        check_file(year, mm, fcnt)
if __name__ == '__main__':
    # Spot-check a single month first (January 2006, no counter shown)
    check_file(yyyy=2006, mm='01')
    # Then sweep through every monthly file
    check_data_files()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment