Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save policevideorequests/e40ccf44e97f130de33b to your computer and use it in GitHub Desktop.
Save policevideorequests/e40ccf44e97f130de33b to your computer and use it in GitHub Desktop.
# Created by Tim Clemans; open-sourced under the GPL.
import os
import re
from datetime import datetime
from datetime import date
def calculate_age(born, today=None):
    """Return the age, in completed years, of someone born on ``born``.

    born  -- ``datetime.date`` of birth.
    today -- optional ``datetime.date`` to measure the age against; defaults
             to ``date.today()``.  Exposed so ages can be computed as of a
             fixed day (and so the function can be tested deterministically).
    """
    if today is None:
        today = date.today()
    try:
        birthday = born.replace(year=today.year)
    except ValueError:
        # Raised when the birth date is February 29 and the reference year is
        # not a leap year; the birthday is then taken to be March 1.
        birthday = born.replace(year=today.year, month=born.month + 1, day=1)
    if birthday > today:
        # This year's birthday has not happened yet.
        return today.year - born.year - 1
    return today.year - born.year
def grab_names(go_report_data):
    """Collect first and last names of the people listed in a parsed report.

    Looks at the 'victims', 'witnesses' and 'suspects' entries of
    ``go_report_data``; entries that are not lists, and list items that are
    not dicts, are ignored.  Every non-empty 'firstname' / 'lastname' value
    is returned twice in a row: once as stored and once as
    ``value.lower().capitalize()``.
    """
    collected = []
    for role in ('victims', 'witnesses', 'suspects'):
        people = go_report_data.get(role)
        if not isinstance(people, list):
            continue
        for person in people:
            if not isinstance(person, dict):
                continue
            for field in ('firstname', 'lastname'):
                value = person.get(field)
                if value:
                    collected.extend([value, value.lower().capitalize()])
    return collected
def get_incident_type(go_report):
    """Return the eighth line of the report text as the incident type.

    Returns None when the text has fewer than eight lines, matching the other
    ``get_*`` parsers, which return None when their field is absent (the
    original raised IndexError and aborted the whole run).
    """
    lines = go_report.split('\n')
    if len(lines) < 8:
        return None
    return lines[7]
def get_go_report_number(go_report):
    """Return the first ``NNNN-NNNNNN`` number found in the text, or None."""
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'\d{4}-\d{6}', go_report)
    return match.group() if match else None
def get_number_of_pages(go_report):
    """Return N from the first ``Page X of N`` marker as an int, or None."""
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'Page \d+ of (?P<pages>\d+)', go_report)
    return int(match.group('pages')) if match else None
def get_number_of_words(go_report):
    """Return the number of whitespace-separated words in the report text.

    Splits on any run of whitespace.  Splitting on a single space, as the
    original did, counted an empty report as one word, treated words
    separated only by a newline as one word, and counted an extra empty
    "word" for every doubled space.
    """
    return len(go_report.split())
def get_number_of_victims(go_report):
    """Count ``VICTIM # n`` headers plus ``VICT/SUSPECT`` markers in the text."""
    # Raw string for the regex; the two counts are summed directly instead of
    # concatenating the match lists just to take their length.
    numbered_victims = len(re.findall(r'VICTIM # \d+', go_report))
    victim_suspects = len(re.findall('VICT/SUSPECT', go_report))
    return numbered_victims + victim_suspects
def get_number_of_witnesses(go_report):
    """Count the ``WITNESS # n`` headers in the report text."""
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    return len(re.findall(r'WITNESS # \d+', go_report))
def get_number_of_arrested(go_report):
    """Count the ``JUV - ARRES # n`` headers in the report text.

    NOTE(review): only the ``JUV - ARRES`` form is matched; whether reports
    use other arrest headers cannot be told from this file.
    """
    # Raw string: '\-' and '\d' in a plain literal are invalid escape sequences.
    return len(re.findall(r'JUV \- ARRES # \d+', go_report))
def get_number_of_suspects(go_report):
    """Count the occurrences of the ``VICT/SUSPECT`` marker in the report text.

    NOTE(review): no other suspect header is counted here -- confirm that is
    intended.
    """
    occurrences = 0
    for _ in re.finditer('VICT/SUSPECT', go_report):
        occurrences += 1
    return occurrences
def get_operational_status(go_report):
    """Return the text after the first ``Operational status: `` label, or None.

    The value runs to the end of that line.  '.' never matches a newline, so
    ``(.*)`` stops at the line end and, unlike the original pattern (which
    required a trailing newline), still matches when the label is on the last
    line of a text without a final newline.
    """
    match = re.search(r'Operational status: (.*)', go_report)
    return match.group(1) if match else None
def get_narrative(go_report):
    """Extract the narrative section of a report as an HTML fragment.

    The narrative is the run of lines between the
    '15 INITIAL INCIDENT DESCRIPTION / NARRATIVE:' heading and the
    'I hereby declare (certify) ...' line (both compared after stripping
    whitespace).  Page furniture lines are dropped, blank-line gaps
    ('\\n\\n') are removed, surrounding '[' / ']' are stripped and the
    remaining newlines become '<br/>'.

    Returns '' when either marker line is missing.
    """
    lines = [line.strip() for line in go_report.split('\n')]
    try:
        first = lines.index('15 INITIAL INCIDENT DESCRIPTION / NARRATIVE:') + 1
        last = lines.index('I hereby declare (certify) under penalty of perjury under the laws of the')
    except ValueError:
        # A marker is absent: there is no narrative to extract.  Only this
        # expected failure is handled; the original swallowed every exception.
        return ''

    # Prefixes of lines repeated on each page (headers, footers, page
    # numbers) that are not part of the narrative itself.
    boilerplate_prefixes = (
        'Page',
        'For: ',
        'SEATTLE POLICE DEPARTMENT',
        'GENERAL OFFENSE HARDCOPY',
        'PUBLIC DISCLOSURE RELEASE COPY',
        'GO#',
        'LAW DEPT BY FOLLOW-UP UNIT',
        ']',
    )
    kept = [
        line for line in lines[first:last]
        if not line.startswith(boilerplate_prefixes)
        # Also drops any line containing digits, a dash and a digit.
        and not re.search(r'\d+\-\d', line)
    ]
    narrative = '\n'.join(kept)
    narrative = narrative.strip().replace('\n\n', '').strip('[]')
    return narrative.replace('\n', '<br/>')
def convert_personal_info_to_dict(personal_info):
    """Parse one person/entity section of a report into a dict.

    personal_info -- the text of a single section, starting at its
                     '<TYPE> # <n>' header line.

    The result contains:
      * 'lastname', 'firstname' and, when present, 'middleinital' (sic) from
        the first 'LAST, FIRST [M]' match in upper-case letters;
      * one entry per 'Label: value' line, keyed by the lower-cased label with
        spaces replaced by underscores;
      * 'type' -- the text before ' # <n>' in the header ('' if no header is
        found);
      * 'age' -- only when a 'date_of_birth' entry parses as e.g. 'Oct-22-1954'.
    """
    personal_data = {}
    name_match = (
        re.search(r'(?P<lastname>[A-Z]+), (?P<firstname>[A-Z]+) (?P<middleinital>[A-Z])', personal_info)
        or re.search(r'(?P<lastname>[A-Z]+), (?P<firstname>[A-Z]+)', personal_info)
    )
    if name_match:
        personal_data = name_match.groupdict()

    # '.' does not match a newline, so each match is confined to one line.
    # The original pattern demanded a trailing '\n' and therefore silently
    # dropped the final 'Label: value' line of text without a closing newline.
    for label, value in re.findall(r'(.*?):(.*)', personal_info):
        personal_data[label.strip().lower().replace(' ', '_')] = value.strip()

    type_match = re.search(r'(?P<type>[ A-Z\-/]+) # \d+', personal_info)
    # Fall back to '' instead of crashing with AttributeError on a None match.
    personal_data['type'] = type_match.group('type') if type_match else ''

    date_of_birth = personal_data.get('date_of_birth')
    if date_of_birth:
        try:
            born = datetime.strptime(date_of_birth, '%b-%d-%Y').date()
        except ValueError:
            # Unparseable date of birth: leave 'age' out.
            born = None
        if born is not None:
            personal_data['age'] = calculate_age(born)
    return personal_data
def extract_personal_info(go_report):
    """Split a report into person/entity sections and group them by type.

    A section starts at a line containing a '<TYPE> # <n>' header and ends
    just before the next line that starts with 'For:'.  A section that is
    never closed by a 'For:' line is not emitted.  Each section is parsed
    with ``convert_personal_info_to_dict``.

    Returns a dict with the keys 'witnesses', 'suspects', 'victims',
    'victim_vehicles' and 'stolens', each a list of parsed sections.
    Sections whose type is not WITNESS, VICTIM, VICTIM VEHICLE or STOLEN are
    filed under 'suspects'.
    """
    lines = go_report.split('\n')
    # Compiled once (raw string) instead of re-parsing the pattern per line.
    header_pattern = re.compile(r'[ A-Z\-/]+ # \d+')

    # (start, stop) line-index pairs; ``start`` is None while between sections.
    spans = []
    start = None
    for index, line in enumerate(lines):
        if start is None:
            if header_pattern.search(line):
                start = index
        elif line.startswith('For:'):
            spans.append((start, index))
            start = None

    data = {'witnesses': [], 'suspects': [], 'victims': [], 'victim_vehicles': [], 'stolens': []}
    # NOTE(review): STOLEN entries are filed under 'victim_vehicles', so
    # 'stolens' always stays empty -- kept as in the original; confirm intent.
    bucket_for_type = {
        'WITNESS': 'witnesses',
        'VICTIM': 'victims',
        'VICTIM VEHICLE': 'victim_vehicles',
        'STOLEN': 'victim_vehicles',
    }
    for begin, end in spans:
        info = convert_personal_info_to_dict('\n'.join(lines[begin:end]))
        info['type'] = info['type'].strip()
        data[bucket_for_type.get(info['type'], 'suspects')].append(info)
    return data
# ---- Parse every extracted report (*.txt) into a dict of fields ----------
go_reports_data = []
go_reports = [f for f in os.listdir('emailswsb/pdfs') if f.endswith('.txt')]
print('Were redacted: %s/%s' % (len([i for i in go_reports if ('Redacted' in i or i.startswith('r_'))]), len(go_reports)))

# Every module-level function named get_* is a field parser; the field name is
# the function name without the 'get_' prefix.  The set of parsers does not
# change while the loop runs, so it is looked up once.
parser_functions = sorted([variable for variable in globals().copy() if variable.startswith('get_')])

for go_report_filename in go_reports:
    # ``with`` closes the file; the original never closed its handles.
    with open(os.path.join('emailswsb/pdfs', go_report_filename), 'r') as f:
        go_report = f.read()

    # calculate ages
    # Date of birth: Oct-22-1954
    # NOTE(review): birthdays / ages / is_minor are not used further down.
    birthdays = re.findall(r'Date of birth: (?P<bd>[\w\d]+\-[\w\d]+\-[\w\d]+)', go_report)
    birthdays = [datetime.strptime(bd, '%b-%d-%Y').date() for bd in birthdays]
    ages = [calculate_age(bd) for bd in birthdays]
    is_minor = [age < 18 for age in ages]

    go_report_data = {}

    # All 'Label: value' lines that have a newline on both sides.  The
    # trailing newline is matched with a lookahead so it is still available
    # as the leading newline of the next line; the original pattern consumed
    # it and therefore skipped every second of two adjacent labelled lines.
    labelled_lines = [
        (label.strip(), value.strip())
        for label, value in re.findall(r'\n(.*?):(.*?)(?=\n)', go_report)
    ]
    questions = [label for label, _ in labelled_lines]
    questions_count = {}
    for q in questions:
        questions_count[q] = questions_count.get(q, 0) + 1

    # Keep only labels that occur exactly once in the report and that do not
    # start with a digit.
    for label, value in labelled_lines:
        if questions_count[label] == 1 and not re.match(r'\d', label):
            go_report_data[label.lower().replace(' ', '_')] = value

    for parser_function in parser_functions:
        go_report_data[parser_function[4:]] = globals()[parser_function](go_report)
    go_report_data['is_redacted'] = go_report_filename.startswith('r_') or 'Redacted' in go_report_filename
    go_report_data.update(extract_personal_info(go_report))
    go_reports_data.append(go_report_data)

# Union of all field names seen in any report, sorted; used as column headings.
go_reports_data_keys = []
for go_report_data in go_reports_data:
    go_reports_data_keys.extend(go_report_data.keys())
headings = sorted(set(go_reports_data_keys))
import csv
import sys
# Write one CSV row per report to go_reports.csv, with one column per heading.
# Fields a report lacks are written as None values.
with open('go_reports.csv', 'wt') as f:
    writer = csv.writer(f)
    writer.writerow(headings)
    writer.writerows(
        [report.get(column) for column in headings]
        for report in go_reports_data
    )
# ---- Raw HTML dump: every field of every report, unredacted ---------------
# NOTE(review): field values are inserted into the markup without HTML
# escaping (the narrative deliberately contains '<br/>').  Report text
# containing '<' or '&' will be interpreted as markup -- review before
# publishing.
html = """
<h1>Demonstration of early days of converting Seattle Police Go reports to structured data</h1>
<h2>This is very very early days.</h2>
<p>The personal info isn't showing up in the right columns for example. There's a ton of work left to do. I haven't even begun writing parsers for the narratives.</p>
<style>
@import url('static/css/reset.css');
body {
font:.7em Arial;
}
table {
border-collapse:collapse;
}
th,td {
border:1px solid #000;
padding:5px;
text-align:left;
vertical-align:top;
}
th {
font-weight:bold;
}
h1 {
background:#000;
color:#FFF;
padding:10px;
font-size:2em;
font-weight:bold;
}
h2 {
font-weight:bold;
font-size:1.5em;
}
h3 {
font-size:1.25em;
}
#main {
padding:10px;
}
p, table, ol, h2, h3, h4, h5 {
margin-bottom:10px;
}
ol {
padding-left:20px;
}
ol li {
list-style-type: decimal;
}
</style>
<table>"""
ordered_heading = ['occurred_between', 'go_report_number','incident_type', 'narrative', 'victims', 'victim_vehicles', 'suspects', 'witnesses']

# Pieces are collected in a list and joined once at the end rather than
# growing one string with += for every cell.
html_parts = [html]
# The preferred columns come first, followed by every heading (so the
# preferred ones appear a second time among the sorted headings).
columns = ordered_heading + headings
# The header row is identical for every report, so it is built once; it is
# still emitted above each report's data row.
header_row = "<tr>" + ''.join("<th>%s</th>" % (header,) for header in columns) + "</tr>"
for go_report_data in go_reports_data:
    html_parts.append(header_row)
    html_parts.append("<tr>")
    for heading in columns:
        col = go_report_data.get(heading)
        if isinstance(col, list):
            # A list of person dicts is rendered as a nested key/value table.
            html_parts.append("<td><table>")
            for i, person in enumerate(col):
                html_parts.append('<tr><th colspan="2">#%s</th></tr>' % (i,))
                for k in sorted(person.keys()):
                    html_parts.append('<tr><th>%s</th><td>%s</td></tr>' % (k, person[k]))
            html_parts.append("</table></td>")
        else:
            html_parts.append("<td>%s</td>" % (col,))
    html_parts.append("</tr>")
html_parts.append("""
</table>
""")
html = ''.join(html_parts)

# ``with`` guarantees the file is closed even if the write fails.
with open('/var/www/html/rawbigtable.html', 'w') as f:
    f.write(html)
#print html
# ---- Redacted HTML table: key columns only, names removed from narrative --
# NOTE(review): values are inserted into the markup without HTML escaping
# (the narrative deliberately contains '<br/>') -- review before publishing.
html = """
<h1>Demonstration of early days of converting Seattle Police Go reports to structured data</h1>
<h2>This is very very early days.</h2>
<p>The personal info isn't showing up in the right columns for example. There's a ton of work left to do. I haven't even begun writing parsers for the narratives.</p>
<style>
@import url('static/css/reset.css');
body {
font:.7em Arial;
}
table {
border-collapse:collapse;
}
th,td {
border:1px solid #000;
padding:5px;
text-align:left;
vertical-align:top;
}
th {
font-weight:bold;
}
h1 {
background:#000;
color:#FFF;
padding:10px;
font-size:2em;
font-weight:bold;
}
h2 {
font-weight:bold;
font-size:1.5em;
}
h3 {
font-size:1.25em;
}
#main {
padding:10px;
}
p, table, ol, h2, h3, h4, h5 {
margin-bottom:10px;
}
ol {
padding-left:20px;
}
ol li {
list-style-type: decimal;
}
</style>
<table>"""
ordered_heading = ['occurred_between', 'go_report_number','incident_type', 'narrative']
import nltk
from nltk.tag.stanford import NERTagger
st = NERTagger('english.all.3class.distsim.crf.ser.gz', 'stanford-ner.jar')

# Text substituted for every name that is removed from a narrative.
redaction_marker = '*redacted person/organization*'

# Pieces are collected in a list and joined once at the end rather than
# growing one string with += for every cell.
html_parts = [html]
header_row = "<tr>" + ''.join("<th>%s</th>" % (header,) for header in ordered_heading) + "</tr>"
for n, go_report_data in enumerate(go_reports_data):
    print('Number: %s' % n)
    html_parts.append(header_row)
    html_parts.append("<tr>")
    for heading in ordered_heading:
        col = go_report_data.get(heading)
        if isinstance(col, list):
            # A list of person dicts is rendered as a nested key/value table.
            html_parts.append("<td><table>")
            for i, person in enumerate(col):
                html_parts.append('<tr><th colspan="2">#%s</th></tr>' % (i,))
                for k in sorted(person.keys()):
                    html_parts.append('<tr><th>%s</th><td>%s</td></tr>' % (k, person[k]))
            html_parts.append("</table></td>")
        elif heading == 'narrative':
            # Pass 1: names already parsed out of the report's person sections.
            # NOTE(review): this debug print writes the unredacted names to stdout.
            known_names = grab_names(go_report_data)
            print(known_names)
            for name in known_names:
                col = col.replace(name, redaction_marker)
            # Pass 2: hyphenated capitalised words such as 'Smith-Jones'.
            for name_with_dash in re.findall('[A-Z][a-z]+-[A-Z][a-z]+', col):
                col = col.replace(name_with_dash, redaction_marker)
            # Pass 3: remaining capitalised words that the Stanford NER tagger
            # labels as a person or an organization.
            capitalized_words = sorted(set(re.findall('[A-Z][a-z]+', col)))
            tags = st.tag(capitalized_words)
            for tag in tags:
                print(tag)
                if tag[1] in ('PERSON', 'ORGANIZATION'):
                    # Plain substring replacement: the word is also replaced
                    # where it occurs inside a longer word.
                    col = col.replace(tag[0], redaction_marker)
            print(capitalized_words)
            html_parts.append("<td>%s</td>" % (col,))
        else:
            html_parts.append("<td>%s</td>" % (col,))
    html_parts.append("</tr>")
html_parts.append("""
</table>
""")
html = ''.join(html_parts)

# ``with`` guarantees the file is closed even if the write fails.
with open('/var/www/html/bigtable.html', 'w') as f:
    f.write(html)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment