jeremywarrick/parser.py

## readme.md

      
    Raw
  

              readme.md
            
          
    Uniform Crime Report Parser

The FBI's Uniform Crime Report (UCR) MASTER data is stored in a fixed-width format which is not a standard format.  This parser allows for conversion to CSV (tabular) data to allow for easier analysis.
Prerequisites


Python3 (tested with 3.6-3.8) with Pandas in your environment

Usage

After creating BOTH a header.csv and record.csv to identify the field type, the parser can run.
The following example is for the Arrest (ASR) MASTER data-set with the sample headers and records CSVs in this GIST:
python parser.py --headercsv sample_header_columns.csv --detailcsv sample_detail_columns.csv --input 2018_ASR1MON_NATIONAL_MASTER_FILE.txt

Header and Record creation

Each UCR data set contains a .doc file or .pdf file that lists the structure of the included fixed-width txt source file.  This parser expects you to create two CSVs that have UNIQUE names and the start and stop column location of the specific record being converted.  Since 0-based indexes are being used, be sure to subtract 1 from the start location.  The resulting 2 CSVs should be separated based on the record type, which is indicated in the documentation by 'HEADER' and 'DETAIL'.  Review the attached PDF for an example and compare the PDF 'position' and 'description' with the values in the UCR zip.

  
## parser.py
import csv
import os
import sys
import logging
import argparse
import pandas as pd
import fileinput

LOGLEVEL = os.environ.get('LOGLEVEL', 'WARNING').upper()
logging.basicConfig(level=LOGLEVEL)

class RecordParser(object):
    def __init__(self, header_csv, record_csv, **kwargs):
        self.primary_list = []
        self.header_data = {}
        self.header_data_records = self._set_colspecs(header_csv)
        self.detail_data = {}
        self.detail_data_records = self._set_colspecs(record_csv)
        self.last_header_row = 0
        self.current_file = None

    def _get_fixed_data(self, line, header):
        item_data = {}
        for col_name, (start_col, end_col) in header:
            item_data[col_name] = line[start_col:end_col]

        return item_data

    def _set_colspecs(self, file_name):
        name = []
        colspecs = []

        df = pd.read_csv(file_name)

        for index, row in df.iterrows():
            name.append(row[0])
            colspecs.append((row[1], row[2]))

        return list(zip(name, colspecs))

    def _process_row(self, line):
        if line[22:25] == "000":
            if (self.last_header_row == (self.current_file.filelineno() - 1) and not self.current_file.isfirstline()):
                logging.info(f"Found empty header at {self.current_file.filelineno()}")
                self.primary_list.append(self.header_data.copy())
            self.last_header_row = self.current_file.filelineno()
            self.header_data.clear()
            self.header_data = self._get_fixed_data(line, self.header_data_records)
            logging.info(f"Header data set to: {self.header_data}")
            return self.header_data
        else:
            line_object = {**self.header_data, **self._get_fixed_data(line, self.detail_data_records)}
            self.primary_list.append(line_object)
            logging.info(f"Line data set to: {line_object}")
            return self.primary_list

    def parse_file(self, file_name):
        with fileinput.input(file_name) as file:
            self.current_file = file
            for line in file:
                self._process_row(line)
                logging.debug(f"Parsed line #{file.filelineno()} : {line}")

    def write_csv(self, outfile="output.csv", newline=""):
        with open(outfile, "w", newline=newline) as csv_file:
            cols = sorted(set(key for key, value in (self.detail_data_records + self.header_data_records)))
            writer = csv.DictWriter(csv_file,
                            fieldnames=cols,
                            restval='',
                            extrasaction='ignore')

            logging.info("Writing output file.")
            writer.writeheader()
            writer.writerows(self.primary_list)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", help="Print detailed parsing information.",
                        action="store_true")
    parser.add_argument("-l", "--headercsv", help="The path to the CSV file containing the header line record format.")
    parser.add_argument("-d", "--detailcsv", help="The path to the CSV file containing the detail line record format.")
    parser.add_argument("-i", "--input", help="The path to the TXT file containing the UCR data file to process.")
    args = parser.parse_args()

    if args.verbose:
        LOGLEVEL = "DEBUG"

    record = RecordParser(args.headercsv, args.detailcsv)
    record.parse_file(args.input)
    record.write_csv()


## sample_detail_columns.csv

          
            identifier
            0
            1

            
              numeric_state_code
              1
              3

            
              ori_code
              3
              10

            
              group
              10
              12

            
              division
              12
              13

            
              year
              13
              15

            
              metropolitan_statistical_area
              15
              18

            
              adjustment
              21
              22

            
              offense
              22
              25

            
              card_1_indicator_adult_male
              18
              19

            
              card_2_indicator_adult_female
              19
              20

            
              card_3_indicator_juvenile
              20
              21

            
              offense_age_male_under_10
              40
              49

            
              offense_age_male_10_to_12
              49
              58

            
              offense_age_male_13_to_14
              58
              67

            
              offense_age_male_15
              67
              76

            
              offense_age_male_16
              76
              85

            
              offense_age_male_17
              85
              94

            
              offense_age_male_18
              94
              103

            
              offense_age_male_19
              103
              112

            
              offense_age_male_20
              112
              121

            
              offense_age_male_21
              121
              130

            
              offense_age_male_22
              130
              139

            
              offense_age_male_23
              139
              148

            
              offense_age_male_24
              148
              157

            
              offense_age_male_25_to_29
              157
              166

            
              offense_age_male_30_to_34
              166
              175

            
              offense_age_male_35_to_39
              175
              184

            
              offense_age_male_40_to_44
              184
              193

            
              offense_age_male_45_to_49
              193
              202

            
              offense_age_male_50_to_54
              202
              211

            
              offense_age_male_55_to_59
              211
              220

            
              offense_age_male_60_to_64
              220
              229

            
              offense_age_male_over_64
              229
              238

            
              offense_age_female_under_10
              238
              247

            
              offense_age_female_10_to_12
              247
              256

            
              offense_age_female_13_to_14
              256
              265

            
              offense_age_female_15
              265
              274

            
              offense_age_female_16
              274
              283

            
              offense_age_female_17
              283
              292

            
              offense_age_female_18
              292
              301

            
              offense_age_female_19
              301
              310

            
              offense_age_female_20
              310
              319

            
              offense_age_female_21
              319
              328

            
              offense_age_female_22
              328
              337

            
              offense_age_female_23
              337
              346

            
              offense_age_female_24
              346
              355

            
              offense_age_female_25_to_29
              355
              364

            
              offense_age_female_30_to_34
              364
              373

            
              offense_age_female_35_to_39
              373
              382

            
              offense_age_female_40_to_44
              382
              391

            
              offense_age_female_45_to_49
              391
              400

            
              offense_age_female_50_to_54
              400
              409

            
              offense_age_female_55_to_59
              409
              418

            
              offense_age_female_60_to_64
              418
              427

            
              offense_age_female_over_64
              427
              436

            
              offense_race_juvenile_white
              436
              445

            
              offense_race_juvenile_black
              445
              454

            
              offense_race_juvenile_indian
              454
              463

            
              offense_race_juvenile_asian
              463
              472

            
              offense_race_juvenile_hispanic
              472
              481

            
              offense_race_juvenile_non_hispanic
              481
              490

            
              offense_race_adult_white
              490
              499

            
              offense_race_adult_black
              499
              508

            
              offense_race_adult_indian
              508
              517

            
              offense_race_adult_asian
              517
              526

            
              offense_race_adult_hispanic
              526
              535

            
              offense_race_adult_non_hispanic
              535
              544

## sample_header_columns.csv

          
            identifier
            0
            1

            
              numeric_state_code
              1
              3

            
              ori_code
              3
              10

            
              group
              10
              12

            
              division
              12
              13

            
              year
              13
              15

            
              metropolitan_statistical_area
              15
              18

            
              adjustment
              21
              22

            
              offense
              22
              25

            
              header_suburban
              18
              19

            
              header_report_indication
              20
              21

            
              header_sequence_number
              27
              32

            
              header_county
              32
              35

            
              header_core_city
              39
              40

            
              header_current_population
              40
              40

            
              header_previous_populations_year_one_to_five
              50
              109

            
              header_agency_name
              110
              135

            
              header_state_name
              135
              141

            
              header_previous_populations_year_six_to_thirteen
              141
              564
	import csv
	import os
	import sys
	import logging
	import argparse
	import pandas as pd
	import fileinput

	LOGLEVEL = os.environ.get('LOGLEVEL', 'WARNING').upper()
	logging.basicConfig(level=LOGLEVEL)

	class RecordParser(object):
	def __init__(self, header_csv, record_csv, **kwargs):
	self.primary_list = []
	self.header_data = {}
	self.header_data_records = self._set_colspecs(header_csv)
	self.detail_data = {}
	self.detail_data_records = self._set_colspecs(record_csv)
	self.last_header_row = 0
	self.current_file = None

	def _get_fixed_data(self, line, header):
	item_data = {}
	for col_name, (start_col, end_col) in header:
	item_data[col_name] = line[start_col:end_col]

	return item_data

	def _set_colspecs(self, file_name):
	name = []
	colspecs = []

	df = pd.read_csv(file_name)

	for index, row in df.iterrows():
	name.append(row[0])
	colspecs.append((row[1], row[2]))

	return list(zip(name, colspecs))

	def _process_row(self, line):
	if line[22:25] == "000":
	if (self.last_header_row == (self.current_file.filelineno() - 1) and not self.current_file.isfirstline()):
	logging.info(f"Found empty header at {self.current_file.filelineno()}")
	self.primary_list.append(self.header_data.copy())
	self.last_header_row = self.current_file.filelineno()
	self.header_data.clear()
	self.header_data = self._get_fixed_data(line, self.header_data_records)
	logging.info(f"Header data set to: {self.header_data}")
	return self.header_data
	else:
	line_object = {self.header_data, self._get_fixed_data(line, self.detail_data_records)}
	self.primary_list.append(line_object)
	logging.info(f"Line data set to: {line_object}")
	return self.primary_list

	def parse_file(self, file_name):
	with fileinput.input(file_name) as file:
	self.current_file = file
	for line in file:
	self._process_row(line)
	logging.debug(f"Parsed line #{file.filelineno()} : {line}")

	def write_csv(self, outfile="output.csv", newline=""):
	with open(outfile, "w", newline=newline) as csv_file:
	cols = sorted(set(key for key, value in (self.detail_data_records + self.header_data_records)))
	writer = csv.DictWriter(csv_file,
	fieldnames=cols,
	restval='',
	extrasaction='ignore')

	logging.info("Writing output file.")
	writer.writeheader()
	writer.writerows(self.primary_list)

	if __name__ == "__main__":
	parser = argparse.ArgumentParser()
	parser.add_argument("-v", "--verbose", help="Print detailed parsing information.",
	action="store_true")
	parser.add_argument("-l", "--headercsv", help="The path to the CSV file containing the header line record format.")
	parser.add_argument("-d", "--detailcsv", help="The path to the CSV file containing the detail line record format.")
	parser.add_argument("-i", "--input", help="The path to the TXT file containing the UCR data file to process.")
	args = parser.parse_args()

	if args.verbose:
	LOGLEVEL = "DEBUG"

	record = RecordParser(args.headercsv, args.detailcsv)
	record.parse_file(args.input)
	record.write_csv()
identifier	0	1
numeric_state_code	1	3
ori_code	3	10
group	10	12
division	12	13
year	13	15
metropolitan_statistical_area	15	18
adjustment	21	22
offense	22	25
card_1_indicator_adult_male	18	19
card_2_indicator_adult_female	19	20
card_3_indicator_juvenile	20	21
offense_age_male_under_10	40	49
offense_age_male_10_to_12	49	58
offense_age_male_13_to_14	58	67
offense_age_male_15	67	76
offense_age_male_16	76	85
offense_age_male_17	85	94
offense_age_male_18	94	103
offense_age_male_19	103	112
offense_age_male_20	112	121
offense_age_male_21	121	130
offense_age_male_22	130	139
offense_age_male_23	139	148
offense_age_male_24	148	157
offense_age_male_25_to_29	157	166
offense_age_male_30_to_34	166	175
offense_age_male_35_to_39	175	184
offense_age_male_40_to_44	184	193
offense_age_male_45_to_49	193	202
offense_age_male_50_to_54	202	211
offense_age_male_55_to_59	211	220
offense_age_male_60_to_64	220	229
offense_age_male_over_64	229	238
offense_age_female_under_10	238	247
offense_age_female_10_to_12	247	256
offense_age_female_13_to_14	256	265
offense_age_female_15	265	274
offense_age_female_16	274	283
offense_age_female_17	283	292
offense_age_female_18	292	301
offense_age_female_19	301	310
offense_age_female_20	310	319
offense_age_female_21	319	328
offense_age_female_22	328	337
offense_age_female_23	337	346
offense_age_female_24	346	355
offense_age_female_25_to_29	355	364
offense_age_female_30_to_34	364	373
offense_age_female_35_to_39	373	382
offense_age_female_40_to_44	382	391
offense_age_female_45_to_49	391	400
offense_age_female_50_to_54	400	409
offense_age_female_55_to_59	409	418
offense_age_female_60_to_64	418	427
offense_age_female_over_64	427	436
offense_race_juvenile_white	436	445
offense_race_juvenile_black	445	454
offense_race_juvenile_indian	454	463
offense_race_juvenile_asian	463	472
offense_race_juvenile_hispanic	472	481
offense_race_juvenile_non_hispanic	481	490
offense_race_adult_white	490	499
offense_race_adult_black	499	508
offense_race_adult_indian	508	517
offense_race_adult_asian	517	526
offense_race_adult_hispanic	526	535
offense_race_adult_non_hispanic	535	544