# policevideorequests/parse_opa_summaries.py

## parse_opa_summaries.py
import os
import re
import subprocess
from xml.sax.saxutils import escape
def _extract_field(text, pattern):
    """Return group 1 of `pattern` in `text`, stripped of spaces and colons.

    Raises:
        AttributeError: if `pattern` does not match (re.search returns None).
    """
    return re.search(pattern, text).group(1).strip(' :')


def _extract_section(text, heading, terminator):
    """Return the text between `heading` and `terminator` with newlines removed.

    `heading` and `terminator` are regex fragments; the match may span
    several lines (DOTALL) and stops at the first occurrence of
    `terminator` after `heading`.

    Raises:
        AttributeError: if the heading/terminator pair is not found.
    """
    match = re.search(heading + r'(.*?)' + terminator, text,
                      re.MULTILINE | re.DOTALL)
    return match.group(1).strip(' :\n').replace('\n', '')


# Remove PDFs left in the current directory, then download the PDFs linked
# from the closed-case-summaries page (-r -l 1: follow links one level deep,
# -A pdf: keep only .pdf files, -nd: do not recreate the remote directories).
os.system('rm *.pdf')
os.system('wget -nd -r -l 1 -A pdf http://www.seattle.gov/opa/closed-case-summaries')

# Convert every PDF to a .txt file with the same base name.
files = sorted(f for f in os.listdir('.') if f.endswith('.pdf'))
for filename in files:
    # The file names come from the remote site, so hand them to pdf2txt.py
    # as an argument list instead of splicing them into a shell command:
    # spaces and shell metacharacters in a name are then taken literally.
    with open(filename[:-4] + '.txt', 'wb') as text_out:
        try:
            subprocess.call(['pdf2txt.py', filename], stdout=text_out)
        except OSError as err:
            # pdf2txt.py could not be started. Report it and carry on; the
            # empty .txt file is skipped by the parser below.
            print('Could not run pdf2txt.py on %s: %s' % (filename, err))

# Parse each text file into a dict keyed by field name. Files that do not
# contain every expected field are skipped.
files = sorted(f for f in os.listdir('.') if f.endswith('.txt'))
opa_files = []
for filename in files:
    # `with` closes the file even when the summary is skipped below.
    with open(filename, 'r') as f:
        opa_file = f.read()
    opa_file_dict = {}
    # Echo the raw text and its file name to stdout.
    print(opa_file)
    print(filename)
    try:
        # Single-line header fields: the value follows the label on the
        # same line.
        opa_file_dict['Complaint number'] = _extract_field(opa_file, r'Complaint Number(.*?)\n')
        opa_file_dict['Issued date'] = _extract_field(opa_file, r'Issued Date(.*?)\n')
        # These two labels are followed by a blank line and then the value,
        # e.g. "OPA Finding", "", "Not Sustained (Lawful & Proper)".
        opa_file_dict['OPA finding'] = _extract_field(opa_file, r'OPA Finding\s*(.*?)\n\n')
        opa_file_dict['Final discipline'] = _extract_field(opa_file, r'Final Discipline\s*(.*?)\n\n')
        # Multi-line sections: everything between one heading and the next.
        opa_file_dict['Incident synopsis'] = _extract_section(opa_file, r'INCIDENT SYNOPSIS', r'(Complaint|COMPLAINT)')
        opa_file_dict['Complaint'] = _extract_section(opa_file, r'COMPLAINT', r'(Complaint|INVESTIGATION)')
        opa_file_dict['Investigation'] = _extract_section(opa_file, r'INVESTIGATION', r'(Complaint|ANALYSIS AND CONCLUSION)')
        opa_file_dict['Analysis and conclusion'] = _extract_section(opa_file, r'ANALYSIS AND CONCLUSION', r'(Complaint|FINDINGS)')
        opa_file_dict['Findings'] = _extract_section(opa_file, r'FINDINGS', r'NOTE')
    except AttributeError:
        # A pattern did not match, so this file lacks an expected field.
        print('Skipping %s: not all expected fields were found' % filename)
        continue
    opa_files.append(opa_file_dict)


# Static start of the page: title, heading, intro paragraph, inline CSS and
# the opening <table> tag.
# NOTE(review): the intro sentence "The source code is at" ends without a
# link; the intended URL is not visible in this file.
html = """
<title>Seattle Police OPA Closed Case Summaries</title>
<h1>OPA Closed Case Summaries</h1>
<p>This table is automatically compiled from the PDFs at <a href="http://www.seattle.gov/opa/closed-case-summaries">http://www.seattle.gov/opa/closed-case-summaries</a>. The source code is at </p><style>
@import url('static/css/reset.css');
body {
font:.7em Arial;
}
p {
margin:10px 10px 0 10px;}
table {
margin:10px;
border-collapse:collapse;
}
th,td {
border:1px solid #000;
padding:5px;
text-align:left;
vertical-align:top;
}
th {
font-weight:bold;
}
h1 {
background:#000;
color:#FFF;
padding:10px;
font-size:2em;
font-weight:bold;
}
h2 {
font-weight:bold;
font-size:1.5em;
}
h3 {
font-size:1.25em;
}
#main {
padding:10px;
}
p, table, ol, h2, h3, h4, h5 {
margin-bottom:10px;
}
ol {
padding-left:20px;
}
ol li {
list-style-type: decimal;
}
</style>
<table>"""
# Column order of the table; each entry is both the column heading and the
# key looked up in every parsed summary dict.
ordered_heading = ['Complaint number', 'Issued date', 'OPA finding', 'Final discipline', 'Incident synopsis', 'Complaint', 'Investigation', 'Analysis and conclusion', 'Findings']

# Collect the fragments in a list and join once at the end.
parts = [html, "<tr>"]
for header in ordered_heading:
    parts.append("<th>%s</th>" % (header))
parts.append("</tr>")
for opa_complaint in opa_files:
    parts.append("<tr>")
    for header in ordered_heading:
        # The cell text is extracted from the PDFs and may contain "&", "<"
        # or ">" (e.g. "Lawful & Proper"); escape it so it is shown as text
        # instead of being read as markup.
        parts.append("<td>%s</td>" % escape(opa_complaint[header]))
    parts.append("</tr>")
# Every row is closed inside the loop, so only the table is closed here.
parts.append("</table>")
html = "".join(parts)

# `with` closes the output file even if the write fails.
with open('/var/www/html/opa_summaries.html', 'w') as f:
    f.write(html)
	# NOTE(review): from this line to the end of the file the script above is
	# repeated with damaged formatting: each non-blank line starts with a single
	# tab, the nesting under for/try/except is gone, "|" appears as "\|", and
	# some regex quantifiers differ from the first copy (e.g. "\s(.?)" here
	# versus "\s*(.*?)" above). The code is left byte-identical; whether to
	# repair or remove this copy needs a maintainer decision - TODO confirm.
	import os
	import re
	# Remove PDFs in the current directory, then fetch the PDFs linked from the
	# closed-case-summaries page with wget.
	os.system('rm *.pdf')
	os.system('wget -nd -r -l 1 -A pdf http://www.seattle.gov/opa/closed-case-summaries')
	# Convert every PDF to a .txt file of the same base name by redirecting the
	# output of pdf2txt.py.
	files = sorted([f for f in os.listdir('.') if f.endswith('.pdf')])
	for filename in files:
	os.system('pdf2txt.py %s > %s' % (filename, filename[:-4]+'.txt'))
	# Parse each .txt file into a dict of named fields; opa_files collects the
	# dicts.
	files = sorted([f for f in os.listdir('.') if f.endswith('.txt')])
	opa_files = []
	for filename in files:
	f = open(filename, 'r')
	opa_file = f.read()
	opa_file_dict = {}
	# Echo the raw text and the file name to stdout.
	print opa_file
	print filename
	try:
	# Header fields: the text after the label up to the end of the line.
	opa_file_dict['Complaint number'] = re.search('Complaint Number(.*?)\n', opa_file).group(1).strip(' :')
	opa_file_dict['Issued date'] = re.search('Issued Date(.*?)\n', opa_file).group(1).strip(' :')
	# OPA Finding
	#
	#Not Sustained (Lawful & Proper)
	opa_file_dict['OPA finding'] = re.search('OPA Finding\s(.?)\n\n', opa_file).group(1).strip(' :')
	#Final Discipline
	#
	#N/A
	opa_file_dict['Final discipline'] = re.search('Final Discipline\s(.?)\n\n', opa_file).group(1).strip(' :')
	# Sections: the text between one heading and the next, newlines removed.
	opa_file_dict['Incident synopsis'] = re.search('INCIDENT SYNOPSIS(.*?)(Complaint\|COMPLAINT)', opa_file, re.MULTILINE\|re.DOTALL).group(1).strip(' :\n').replace('\n', '')
	opa_file_dict['Complaint'] = re.search('COMPLAINT(.*?)(Complaint\|INVESTIGATION)', opa_file, re.MULTILINE\|re.DOTALL).group(1).strip(' :\n').replace('\n', '')
	opa_file_dict['Investigation'] = re.search('INVESTIGATION(.*?)(Complaint\|ANALYSIS AND CONCLUSION)', opa_file, re.MULTILINE\|re.DOTALL).group(1).strip(' :\n').replace('\n', '')
	opa_file_dict['Analysis and conclusion'] = re.search('ANALYSIS AND CONCLUSION(.*?)(Complaint\|FINDINGS)', opa_file, re.MULTILINE\|re.DOTALL).group(1).strip(' :\n').replace('\n', '')
	opa_file_dict['Findings'] = re.search('FINDINGS(.*?)NOTE', opa_file, re.MULTILINE\|re.DOTALL).group(1).strip(' :\n').replace('\n', '')

	#opa_file_dict['complaint'] = re.search('COMPLAINT\s(.?)\n\n\n', opa_file).group(1).strip(' :')
	# Any exception raised while extracting the fields skips this file.
	except:
	continue
	opa_files.append(opa_file_dict)
	f.close()
	#print opa_files


	# NOTE(review): this is the HTML-generation half of the formatting-damaged
	# duplicate of the script above; the code is left byte-identical - TODO
	# confirm whether this copy should be repaired or removed.
	# Static start of the page: title, heading, intro paragraph, inline CSS and
	# the opening <table> tag.
	html = """
	<title>Seattle Police OPA Closed Case Summaries</title>
	<h1>OPA Closed Case Summaries</h1>
	<p>This table is automatically compiled from the PDFs at <a href="http://www.seattle.gov/opa/closed-case-summaries">http://www.seattle.gov/opa/closed-case-summaries</a>. The source code is at </p><style>
	@import url('static/css/reset.css');
	body {
	font:.7em Arial;
	}
	p {
	margin:10px 10px 0 10px;}
	table {
	margin:10px;
	border-collapse:collapse;
	}
	th,td {
	border:1px solid #000;
	padding:5px;
	text-align:left;
	vertical-align:top;
	}
	th {
	font-weight:bold;
	}
	h1 {
	background:#000;
	color:#FFF;
	padding:10px;
	font-size:2em;
	font-weight:bold;
	}
	h2 {
	font-weight:bold;
	font-size:1.5em;
	}
	h3 {
	font-size:1.25em;
	}
	#main {
	padding:10px;
	}
	p, table, ol, h2, h3, h4, h5 {
	margin-bottom:10px;
	}
	ol {
	padding-left:20px;
	}
	ol li {
	list-style-type: decimal;
	}
	</style>
	<table>"""
	# Column order of the table; each entry is both the heading text and the key
	# looked up in every parsed dict.
	ordered_heading = ['Complaint number', 'Issued date', 'OPA finding', 'Final discipline', 'Incident synopsis', 'Complaint', 'Investigation', 'Analysis and conclusion', 'Findings']
	# Header row.
	html += "<tr>"
	for header in ordered_heading:
	html += "<th>%s</th>" % (header)
	html += "</tr>"
	# One table row per parsed summary, cells in ordered_heading order.
	for opa_complaint in opa_files:
	html += "<tr>"
	for header in ordered_heading:
	html += "<td>%s</td>" % (opa_complaint[header])
	html += "</tr>"
	# NOTE(review): two "</tr>" appends follow the cell loop; with the nesting
	# lost it is unclear which belongs to the row loop, but the first copy above
	# has one inside the loop and one extra after it.
	html += "</tr>"
	html += "</table>"
	# Write the finished page to the web server directory.
	f = open('/var/www/html/opa_summaries.html', 'w')
	f.write(html)
	f.close()