Skip to content

Instantly share code, notes, and snippets.

@jeremywarrick
Last active September 7, 2020 13:08
Show Gist options
  • Save jeremywarrick/091e1a8731baa943efcea26e1ffbaf66 to your computer and use it in GitHub Desktop.
Save jeremywarrick/091e1a8731baa943efcea26e1ffbaf66 to your computer and use it in GitHub Desktop.
A simple UCR data parser

Uniform Crime Report Parser

The FBI's Uniform Crime Report (UCR) MASTER data is stored in a fixed-width format which is not a standard format. This parser allows for conversion to CSV (tabular) data to allow for easier analysis.

Prerequisites

  • Python3 (tested with 3.6-3.8) with Pandas in your environment

Usage

After creating BOTH a header.csv and record.csv to identify the field type, the parser can run.

The following example is for the Arrest (ASR) MASTER data-set with the sample headers and records CSVs in this GIST:

python parser.py --headercsv sample_header_columns.csv --detailcsv sample_detail_columns.csv --input 2018_ASR1MON_NATIONAL_MASTER_FILE.txt

Header and Record creation

Each UCR data set contains a .doc file or .pdf file that lists the structure of the included fixed-width txt source file. This parser expects you to create two CSVs that have UNIQUE names and the start and stop column location of the specific record being converted. Since 0-based indexes are being used, be sure to subtract 1 from the start location. The resulting 2 CSVs should be separated based on the record type, which is indicated in the documentation by 'HEADER' and 'DETAIL'. Review the attached PDF for an example and compare the PDF 'position' and 'description' with the values in the UCR zip.

import csv
import os
import sys
import logging
import argparse
import pandas as pd
import fileinput
LOGLEVEL = os.environ.get('LOGLEVEL', 'WARNING').upper()
logging.basicConfig(level=LOGLEVEL)
class RecordParser(object):
def __init__(self, header_csv, record_csv, **kwargs):
self.primary_list = []
self.header_data = {}
self.header_data_records = self._set_colspecs(header_csv)
self.detail_data = {}
self.detail_data_records = self._set_colspecs(record_csv)
self.last_header_row = 0
self.current_file = None
def _get_fixed_data(self, line, header):
item_data = {}
for col_name, (start_col, end_col) in header:
item_data[col_name] = line[start_col:end_col]
return item_data
def _set_colspecs(self, file_name):
name = []
colspecs = []
df = pd.read_csv(file_name)
for index, row in df.iterrows():
name.append(row[0])
colspecs.append((row[1], row[2]))
return list(zip(name, colspecs))
def _process_row(self, line):
if line[22:25] == "000":
if (self.last_header_row == (self.current_file.filelineno() - 1) and not self.current_file.isfirstline()):
logging.info(f"Found empty header at {self.current_file.filelineno()}")
self.primary_list.append(self.header_data.copy())
self.last_header_row = self.current_file.filelineno()
self.header_data.clear()
self.header_data = self._get_fixed_data(line, self.header_data_records)
logging.info(f"Header data set to: {self.header_data}")
return self.header_data
else:
line_object = {**self.header_data, **self._get_fixed_data(line, self.detail_data_records)}
self.primary_list.append(line_object)
logging.info(f"Line data set to: {line_object}")
return self.primary_list
def parse_file(self, file_name):
with fileinput.input(file_name) as file:
self.current_file = file
for line in file:
self._process_row(line)
logging.debug(f"Parsed line #{file.filelineno()} : {line}")
def write_csv(self, outfile="output.csv", newline=""):
with open(outfile, "w", newline=newline) as csv_file:
cols = sorted(set(key for key, value in (self.detail_data_records + self.header_data_records)))
writer = csv.DictWriter(csv_file,
fieldnames=cols,
restval='',
extrasaction='ignore')
logging.info("Writing output file.")
writer.writeheader()
writer.writerows(self.primary_list)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-v", "--verbose", help="Print detailed parsing information.",
action="store_true")
parser.add_argument("-l", "--headercsv", help="The path to the CSV file containing the header line record format.")
parser.add_argument("-d", "--detailcsv", help="The path to the CSV file containing the detail line record format.")
parser.add_argument("-i", "--input", help="The path to the TXT file containing the UCR data file to process.")
args = parser.parse_args()
if args.verbose:
LOGLEVEL = "DEBUG"
record = RecordParser(args.headercsv, args.detailcsv)
record.parse_file(args.input)
record.write_csv()
identifier 0 1
numeric_state_code 1 3
ori_code 3 10
group 10 12
division 12 13
year 13 15
metropolitan_statistical_area 15 18
adjustment 21 22
offense 22 25
card_1_indicator_adult_male 18 19
card_2_indicator_adult_female 19 20
card_3_indicator_juvenile 20 21
offense_age_male_under_10 40 49
offense_age_male_10_to_12 49 58
offense_age_male_13_to_14 58 67
offense_age_male_15 67 76
offense_age_male_16 76 85
offense_age_male_17 85 94
offense_age_male_18 94 103
offense_age_male_19 103 112
offense_age_male_20 112 121
offense_age_male_21 121 130
offense_age_male_22 130 139
offense_age_male_23 139 148
offense_age_male_24 148 157
offense_age_male_25_to_29 157 166
offense_age_male_30_to_34 166 175
offense_age_male_35_to_39 175 184
offense_age_male_40_to_44 184 193
offense_age_male_45_to_49 193 202
offense_age_male_50_to_54 202 211
offense_age_male_55_to_59 211 220
offense_age_male_60_to_64 220 229
offense_age_male_over_64 229 238
offense_age_female_under_10 238 247
offense_age_female_10_to_12 247 256
offense_age_female_13_to_14 256 265
offense_age_female_15 265 274
offense_age_female_16 274 283
offense_age_female_17 283 292
offense_age_female_18 292 301
offense_age_female_19 301 310
offense_age_female_20 310 319
offense_age_female_21 319 328
offense_age_female_22 328 337
offense_age_female_23 337 346
offense_age_female_24 346 355
offense_age_female_25_to_29 355 364
offense_age_female_30_to_34 364 373
offense_age_female_35_to_39 373 382
offense_age_female_40_to_44 382 391
offense_age_female_45_to_49 391 400
offense_age_female_50_to_54 400 409
offense_age_female_55_to_59 409 418
offense_age_female_60_to_64 418 427
offense_age_female_over_64 427 436
offense_race_juvenile_white 436 445
offense_race_juvenile_black 445 454
offense_race_juvenile_indian 454 463
offense_race_juvenile_asian 463 472
offense_race_juvenile_hispanic 472 481
offense_race_juvenile_non_hispanic 481 490
offense_race_adult_white 490 499
offense_race_adult_black 499 508
offense_race_adult_indian 508 517
offense_race_adult_asian 517 526
offense_race_adult_hispanic 526 535
offense_race_adult_non_hispanic 535 544
identifier 0 1
numeric_state_code 1 3
ori_code 3 10
group 10 12
division 12 13
year 13 15
metropolitan_statistical_area 15 18
adjustment 21 22
offense 22 25
header_suburban 18 19
header_report_indication 20 21
header_sequence_number 27 32
header_county 32 35
header_core_city 39 40
header_current_population 40 40
header_previous_populations_year_one_to_five 50 109
header_agency_name 110 135
header_state_name 135 141
header_previous_populations_year_six_to_thirteen 141 564
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment