Created
January 2, 2015 23:05
-
-
Save policevideorequests/e40ccf44e97f130de33b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# created by Tim Clemans and is open sourced under GPL | |
import os | |
import re | |
from datetime import datetime | |
from datetime import date | |
def calculate_age(born):
    """Return the whole-year age, as of today, of someone born on *born*."""
    today = date.today()
    try:
        this_years_birthday = born.replace(year=today.year)
    except ValueError:
        # Born on February 29 and the current year is not a leap year:
        # treat the birthday as March 1 of this year.
        this_years_birthday = born.replace(year=today.year, month=born.month + 1, day=1)
    # Subtract one year if this year's birthday has not happened yet.
    return today.year - born.year - (1 if this_years_birthday > today else 0)
def grab_names(go_report_data):
    """Collect first/last names of victims, witnesses and suspects.

    Each present name is appended twice: exactly as recorded (typically
    ALL CAPS) and in Capitalized form, so later substring replacement
    catches both spellings.
    """
    names = []
    for role in ('victims', 'witnesses', 'suspects'):
        people = go_report_data.get(role)
        if not isinstance(people, list):
            continue
        for person in people:
            if not isinstance(person, dict):
                continue
            for field in ('firstname', 'lastname'):
                value = person.get(field)
                if value:
                    names.append(value)
                    names.append(value.lower().capitalize())
    return names
def get_incident_type(go_report):
    """Return the incident type, which sits on the 8th line of the report."""
    lines = go_report.split('\n')
    return lines[7]
def get_go_report_number(go_report):
    """Return the first YYYY-NNNNNN general-offense number, or None."""
    match = re.search(r'\d{4}-\d{6}', go_report)
    return match.group() if match else None
def get_number_of_pages(go_report):
    """Return the page count from a 'Page X of Y' marker, or None if absent."""
    match = re.search(r'Page \d+ of (?P<pages>\d+)', go_report)
    return int(match.group('pages')) if match else None
def get_number_of_words(go_report):
    """Return the number of whitespace-separated words in the report.

    Uses str.split() with no argument so runs of spaces, tabs and newlines
    count as a single separator; the previous split(' ') produced empty
    strings for consecutive spaces and never split on newlines, inflating
    the count on OCR'd text.
    """
    return len(go_report.split())
def get_number_of_victims(go_report):
    """Count victim entries: 'VICTIM # n' headings plus combined
    victim/suspect ('VICT/SUSPECT') entries."""
    victims = re.findall(r'VICTIM # \d+', go_report)
    combined = re.findall(r'VICT/SUSPECT', go_report)
    return len(victims) + len(combined)
def get_number_of_witnesses(go_report):
    """Count 'WITNESS # n' headings in the report."""
    return sum(1 for _ in re.finditer(r'WITNESS # \d+', go_report))
def get_number_of_arrested(go_report):
    """Count juvenile arrest entries ('JUV - ARRES # n') in the report."""
    return sum(1 for _ in re.finditer(r'JUV - ARRES # \d+', go_report))
def get_number_of_suspects(go_report):
    """Count combined victim/suspect ('VICT/SUSPECT') entries."""
    # Counting a literal substring is equivalent to len(re.findall(...)).
    return go_report.count('VICT/SUSPECT')
def get_operational_status(go_report):
    """Return the text after 'Operational status: ' up to end of line, or None."""
    match = re.search(r'Operational status: (.*?)\n', go_report)
    return match.group(1) if match else None
def get_narrative(go_report):
    """Extract the officer narrative section as an HTML-ish string.

    Takes the lines between the '15 INITIAL INCIDENT DESCRIPTION /
    NARRATIVE:' heading and the perjury declaration, drops boilerplate
    (page headers, department banners, GO numbers, lines containing a
    digits-dash-digit case-number fragment), then joins what is left with
    <br/> tags. Returns '' when either boundary heading is missing.

    Fix over the original: the bare ``except: pass`` (which hid every
    error) is narrowed to the only exception the lookup can raise,
    ``ValueError`` from ``list.index``.
    """
    lines = [line.strip() for line in go_report.split('\n')]
    narrative = ''
    try:
        start = lines.index('15 INITIAL INCIDENT DESCRIPTION / NARRATIVE:') + 1
        stop = lines.index('I hereby declare (certify) under penalty of perjury under the laws of the')
    except ValueError:
        # One of the boundary headings is absent; nothing to extract.
        return narrative
    # All nine original filters are order-independent membership tests,
    # so they collapse into one prefix tuple plus one regex check.
    boilerplate_prefixes = ('Page', 'For: ', 'SEATTLE POLICE DEPARTMENT',
                            'GENERAL OFFENSE HARDCOPY',
                            'PUBLIC DISCLOSURE RELEASE COPY', 'GO#',
                            'LAW DEPT BY FOLLOW-UP UNIT', ']')
    kept = [line for line in lines[start:stop]
            if not line.startswith(boilerplate_prefixes)
            and not re.search(r'\d+-\d', line)]
    narrative = '\n'.join(kept)
    # Collapse blank lines, trim stray OCR brackets, then HTML-ize breaks.
    narrative = narrative.strip().replace('\n\n', '').strip('[]')
    return narrative.replace('\n', '<br/>')
def convert_personal_info_to_dict(personal_info):
    """Parse one person's section of a GO report into a dict.

    Extracts the 'LAST, FIRST M' name line, every 'Label: value' line
    (keys lower-cased with spaces turned into underscores), the entry
    type from the 'TYPE # n' heading, and a computed 'age' when a
    parseable 'date_of_birth' field is present.

    Fix over the original: the bare ``except: pass`` around the age
    computation is narrowed to ``(KeyError, ValueError)`` — the only
    expected failures (no date_of_birth captured, or a date not in
    Mmm-DD-YYYY form) — so genuine bugs are no longer swallowed.
    """
    personal_data = {}
    # Prefer the form with a middle initial; fall back to last/first only.
    m = re.search(r'(?P<lastname>[A-Z]+), (?P<firstname>[A-Z]+) (?P<middleinital>[A-Z])', personal_info)
    if not m:
        m = re.search(r'(?P<lastname>[A-Z]+), (?P<firstname>[A-Z]+)', personal_info)
    if m:
        personal_data = m.groupdict()
    # Fold every 'Label: value' line into snake_case keys.
    personal_data.update({label.strip().lower().replace(' ', '_'): value.strip()
                          for label, value in re.findall(r'(.*?):(.*?)\n', personal_info)})
    # The section heading looks like 'VICTIM # 1' / 'VICT/SUSPECT # 2';
    # callers only pass sections that start with such a heading.
    personal_data['type'] = re.search(r'(?P<type>[ A-Z\-/]+) # \d+', personal_info).group('type')
    try:
        born = datetime.strptime(personal_data['date_of_birth'], '%b-%d-%Y').date()
        personal_data['age'] = calculate_age(born)
    except (KeyError, ValueError):
        # No date of birth captured, or not in Mmm-DD-YYYY form.
        pass
    return personal_data
def extract_personal_info(go_report):
    """Locate every person section of the report and bucket the parsed
    records by role.

    A section starts at a 'TYPE # n' heading line and ends at the next
    line beginning with 'For:' (the page footer). Each section is parsed
    by convert_personal_info_to_dict and appended to the matching list
    in the returned dict ('witnesses', 'victims', 'victim_vehicles' for
    both VICTIM VEHICLE and STOLEN entries, everything else 'suspects';
    'stolens' is reserved but currently left empty).
    """
    lines = go_report.split('\n')
    spans = []            # (start, stop) line-index pairs, one per section
    start = None
    in_section = False    # True between a heading and its 'For:' footer
    for index, line in enumerate(lines):
        if not in_section:
            if re.search(r'[ A-Z\-/]+ # \d+', line):
                start = index
                in_section = True
        elif line.startswith('For:'):
            spans.append((start, index))
            in_section = False
    records = [convert_personal_info_to_dict('\n'.join(lines[a:b]))
               for a, b in spans]
    data = {'witnesses': [], 'suspects': [], 'victims': [],
            'victim_vehicles': [], 'stolens': []}
    for record in records:
        record['type'] = record['type'].strip()
        kind = record.get('type')
        if kind == 'WITNESS':
            data['witnesses'].append(record)
        elif kind == 'VICTIM':
            data['victims'].append(record)
        elif kind in ('VICTIM VEHICLE', 'STOLEN'):
            data['victim_vehicles'].append(record)
        else:
            data['suspects'].append(record)
    return data
# ---------------------------------------------------------------------------
# Ingest every OCR'd GO-report text file and parse each one into a dict.
# ---------------------------------------------------------------------------
go_reports_data = []
go_reports = [f for f in os.listdir('emailswsb/pdfs') if f.endswith('.txt')]
# Files whose names start with 'r_' or contain 'Redacted' were released redacted.
print 'Were redacted: %s/%s' % (len([i for i in go_reports if ('Redacted' in i or i.startswith('r_'))]), len(go_reports))
for go_report in go_reports:
    go_report_filename = go_report
    #print go_report
    # NOTE(review): the file handle is never closed and `go_report` is
    # rebound from filename to file contents here.
    f = open('emailswsb/pdfs/%s' % (go_report), 'r')
    go_report = f.read()
    # calculate ages
    # Date of birth: Oct-22-1954
    birthdays = re.findall('Date of birth: (?P<bd>[\w\d]+\-[\w\d]+\-[\w\d]+)', go_report)
    birthdays = [datetime.strptime(bd, '%b-%d-%Y').date() for bd in birthdays]
    ages = [calculate_age(bd) for bd in birthdays]
    # NOTE(review): `is_minor` is computed but never used below.
    is_minor = [age < 18 for age in ages]
    #print birthdays
    #print ages
    go_report_data = {}
    # Count how often each 'Label:' appears; only labels occurring exactly
    # once are promoted to top-level fields (repeated labels are ambiguous),
    # and labels starting with digits (section numbers) are skipped.
    questions = [i[0].strip() for i in re.findall('\n(.*?):(.*?)\n', go_report)]
    questions_count = {}
    for q in sorted(list(set(questions))):
        questions_count[q] = questions.count(q)
    go_report_data.update(dict([(i[0].strip().lower().replace(' ', '_'), i[1].strip()) for i in re.findall('\n(.*?):(.*?)\n', go_report) if questions_count[i[0].strip()] == 1 and not re.match('\d+(.*?)', i[0].strip())]))
    # Run every module-level get_* parser over the raw text; the result key
    # is the function name minus its 'get_' prefix. Renaming any get_*
    # function or adding a new one changes this dict's keys.
    parser_functions = sorted([variable for variable in globals().copy() if variable.startswith('get_')])
    for parser_function in parser_functions:
        go_report_data[parser_function[4:]] = globals()[parser_function](go_report)
    go_report_data['is_redacted'] = True if (go_report_filename.startswith('r_') or 'Redacted' in go_report_filename) else False
    # Merge in the per-person buckets (victims/witnesses/suspects/...).
    go_report_data.update(extract_personal_info(go_report))
    go_reports_data.append(go_report_data)
#print go_reports_data | |
# The union of all keys seen across reports becomes the CSV column set.
go_reports_data_keys = []
for go_report_data in go_reports_data:
    go_reports_data_keys.extend(go_report_data.keys())
headings = sorted(list(set(go_reports_data_keys)))
#print [variable for variable in globals().copy() if variable.startswith('get_')]
#print globals()['get_number_of_pages']
import csv
import sys
# Write one CSV row per parsed report; fields a report lacks come out as
# None via dict.get.
f = open('go_reports.csv', 'wt')
try:
    writer = csv.writer(f)
    writer.writerow( headings )
    for go_report_data in go_reports_data:
        writer.writerow( [go_report_data.get(heading) for heading in headings] )
finally:
    f.close()
# ---------------------------------------------------------------------------
# Build the "raw" HTML table (no redaction) and write it for the web server.
# NOTE(review): a fresh header row is emitted inside the per-report loop, so
# every report's data row is preceded by its own header row — confirm this is
# intentional rather than a loop-nesting slip.
# ---------------------------------------------------------------------------
html = """
<h1>Demonstration of early days of converting Seattle Police Go reports to structured data</h1>
<h2>This is very very early days.</h2>
<p>The personal info isn't showing up in the right columns for example. There's a ton of work left to do. I haven't even begun writing parsers for the narratives.</p>
<style>
@import url('static/css/reset.css');
body {
font:.7em Arial;
}
table {
border-collapse:collapse;
}
th,td {
border:1px solid #000;
padding:5px;
text-align:left;
vertical-align:top;
}
th {
font-weight:bold;
}
h1 {
background:#000;
color:#FFF;
padding:10px;
font-size:2em;
font-weight:bold;
}
h2 {
font-weight:bold;
font-size:1.5em;
}
h3 {
font-size:1.25em;
}
#main {
padding:10px;
}
p, table, ol, h2, h3, h4, h5 {
margin-bottom:10px;
}
ol {
padding-left:20px;
}
ol li {
list-style-type: decimal;
}
</style>
<table>"""
# Preferred columns shown first, then every discovered heading again.
ordered_heading = ['occurred_between', 'go_report_number','incident_type', 'narrative', 'victims', 'victim_vehicles', 'suspects', 'witnesses']
for go_report_data in go_reports_data:
    html += "<tr>"
    for header in ordered_heading+headings:
        html += "<th>%s</th>" % (header)
    html += "</tr>"
    html += "<tr>"
    for col in [go_report_data.get(heading) for heading in ordered_heading+headings]:
        # List-valued columns (people buckets) render as nested key/value tables.
        if isinstance(col, list):
            html += "<td><table>"
            for i, person in enumerate(col):
                html += '<tr><th colspan="2">#%s</th></tr>' % (i)
                for k in sorted(person.keys()):
                    html += '<tr><th>%s</th><td>%s</td></tr>' % (k, person[k])
            html += "</table></td>"
        else:
            html += "<td>%s</td>" % (col)
    html += "</tr>"
html += """
</table>
"""
f = open('/var/www/html/rawbigtable.html', 'w')
f.write(html)
f.close()
#print html | |
# ---------------------------------------------------------------------------
# Build a second HTML table whose narrative column is redacted: known names
# parsed from the report, hyphenated Capitalized-Capitalized surnames, and
# any capitalized word the Stanford NER tagger labels PERSON or ORGANIZATION
# are all replaced with a placeholder.
# ---------------------------------------------------------------------------
html = """
<h1>Demonstration of early days of converting Seattle Police Go reports to structured data</h1>
<h2>This is very very early days.</h2>
<p>The personal info isn't showing up in the right columns for example. There's a ton of work left to do. I haven't even begun writing parsers for the narratives.</p>
<style>
@import url('static/css/reset.css');
body {
font:.7em Arial;
}
table {
border-collapse:collapse;
}
th,td {
border:1px solid #000;
padding:5px;
text-align:left;
vertical-align:top;
}
th {
font-weight:bold;
}
h1 {
background:#000;
color:#FFF;
padding:10px;
font-size:2em;
font-weight:bold;
}
h2 {
font-weight:bold;
font-size:1.5em;
}
h3 {
font-size:1.25em;
}
#main {
padding:10px;
}
p, table, ol, h2, h3, h4, h5 {
margin-bottom:10px;
}
ol {
padding-left:20px;
}
ol li {
list-style-type: decimal;
}
</style>
<table>"""
ordered_heading = ['occurred_between', 'go_report_number','incident_type', 'narrative']
import nltk
from nltk.tag.stanford import NERTagger
# Stanford 3-class model (PERSON/ORGANIZATION/LOCATION); both the model file
# and stanford-ner.jar are expected in the working directory.
st = NERTagger('english.all.3class.distsim.crf.ser.gz', 'stanford-ner.jar')
for n, go_report_data in enumerate(go_reports_data):
    print 'Number:', n
    #print grab_names(go_report_data)
    # NOTE(review): as in the raw table above, a header row is emitted per
    # report — confirm intentional.
    html += "<tr>"
    for header in ordered_heading:
        html += "<th>%s</th>" % (header)
    html += "</tr>"
    html += "<tr>"
    for heading in ordered_heading:
        col = go_report_data.get(heading)
        if isinstance(col, list):
            html += "<td><table>"
            for i, person in enumerate(col):
                html += '<tr><th colspan="2">#%s</th></tr>' % (i)
                for k in sorted(person.keys()):
                    html += '<tr><th>%s</th><td>%s</td></tr>' % (k, person[k])
            html += "</table></td>"
        else:
            if heading == 'narrative':
                print grab_names(go_report_data)
                # Pass 1: redact names already parsed out of the report.
                for name in grab_names(go_report_data):
                    col = col.replace(name, '*redacted person/organization*')
                #find names with dashes in them
                names_with_dashes = re.findall('[A-Z][a-z]+-[A-Z][a-z]+', col)
                for name_with_dash in names_with_dashes:
                    col = col.replace(name_with_dash, '*redacted person/organization*')
                # Pass 2: NER-tag each distinct capitalized word and redact
                # those labeled PERSON or ORGANIZATION. Tagging words out of
                # sentence context is presumably a speed trade-off — the
                # sentence-level version is the commented experiment below.
                capitalized_words = sorted(list(set(re.findall('[A-Z][a-z]+', col))))
                tags = st.tag(capitalized_words)
                for tag in tags:
                    print tag
                    if tag[1]=='PERSON' or tag[1]=='ORGANIZATION':
                        col = col.replace(tag[0], '*redacted person/organization*')
                print capitalized_words
                # NOTE(review): `text` is assigned but never used afterwards.
                text = col
                #for sent in nltk.sent_tokenize(text):
                #    print 'working on', sent
                #    tokens = nltk.tokenize.word_tokenize(sent)
                #    tags = st.tag(tokens)
                #    for tag in tags:
                #        if tag[1]=='PERSON': print tag
                #
                #        col = col.replace(tag[0], '*R*')
                html += "<td>%s</td>" % (col)
            else:
                html += "<td>%s</td>" % (col)
    html += "</tr>"
html += """
</table>
"""
f = open('/var/www/html/bigtable.html', 'w')
f.write(html)
f.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment