Last active
December 19, 2019 08:00
-
-
Save weirdestnerd/6c2e79addee16d1e392e3954cc9636f4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import calendar
import io
import os
from functools import reduce

import pandas as pd
import PyPDF2
import requests
import tabula
import textract
def get_from_url(url=''):
    """Download the PDF at ``url`` and return a PyPDF2 reader for it.

    ``.pdf`` is appended to ``url`` when it does not already end with it.
    The document is kept in memory: nothing is written to disk, no file
    handle is left open, and the returned reader remains usable after this
    function returns.

    Args:
        url: Address of the PDF to fetch.

    Returns:
        A ``PyPDF2.PdfFileReader`` over the downloaded bytes, or ``None``
        when the downloaded content could not be parsed as a PDF.

    Raises:
        ValueError: If the server answers with any status other than 200.
    """
    # make sure link ends with .pdf
    if not url.endswith('.pdf'):
        url += '.pdf'

    response = requests.get(url)
    if response.status_code != 200:
        raise ValueError('URL not found')

    try:
        # The reader keeps using the stream it is given after construction,
        # so it is backed by an in-memory buffer instead of a temporary file
        # that would have to stay open (and could not be deleted safely).
        return PyPDF2.PdfFileReader(io.BytesIO(response.content))
    except Exception:
        # Deliberate best effort: an unparsable document is reported and
        # signalled to the caller with None rather than aborting the run.
        print('Problem parsing pdf.')
        return None
# Disposition labels (lower-case); one of these closes each crime-log entry.
_DISPOSITION_TYPES = frozenset([
    'cleared by arrest', 'cleared by exceptional means', 'closed', 'inactive',
    'other agency jurisdiction', 'mcao transmitted', 'mcao turn down',
    'reported to institution', 'pending', 'unfounded',
])


def text2list(text):
    """Split the text extracted from one PDF page into crime-log rows.

    The text is split on newlines; the first 4 and the last 15 lines are
    discarded as unwanted values (presumably page header/footer - confirm
    against a sample page). The remaining lines are grouped into rows: each
    row is 8 leading fields, then the location, then a disposition. A row
    ends at the first line, at or after its 9th field, whose lower-cased
    text is a known disposition.

    * If the location was split over several lines, the pieces are
      concatenated (no separator) so the row has exactly 10 fields.
    * If the disposition directly follows the 8 leading fields, an empty
      location is inserted so the disposition stays in the last position.
    * If the text runs out before a disposition is found, the remaining
      lines are still returned as a final (possibly short) row.

    Args:
        text: Raw text of one page.

    Returns:
        A list of rows, each a list of strings. Empty when ``text`` is empty
        or contains no complete set of leading fields.
    """
    if not text:
        return []

    # remove unwanted values: 4 lines at the top, 15 at the bottom. A single
    # slice also copes with pages shorter than 15 lines (yields no lines).
    lines = text.split('\n')[4:-15]

    # holds the rows for each log found in the text extract
    rows = []
    total = len(lines)
    start = 0
    # A row needs its 8 leading fields plus at least one more line.
    while start + 8 < total:
        # find index of disposition, searching from the location slot onward
        end = start + 8
        while end < total and lines[end].lower() not in _DISPOSITION_TYPES:
            end += 1
        found = end < total
        if found:
            # include the disposition line itself in the row
            end += 1
        row = lines[start:end]

        if found and len(row) == 9:
            # Disposition sits where the location should be: keep the
            # columns aligned by inserting an empty location.
            row.insert(8, '')
        elif len(row) > 10:
            # address is split: leave values prior to address as is,
            # combine address as one, append disposition
            row = row[:8] + [''.join(row[8:-1])] + [row[-1]]

        rows.append(row)
        start = end
    return rows
# Full English month names, January through December
# (calendar.month_name[0] is an empty placeholder).
months = list(calendar.month_name)[1:]

# Each monthly log is requested under two file-name spellings, in order:
# with and without a space between the month and the year.
url_templates = (
    'https://www.asu.edu/police/logs/{} {}.pdf',
    'https://www.asu.edu/police/logs/{}{}.pdf',
)

# Column names for the 10 fields of every crime-log row.
log_columns = ['incident_id', 'date_reported', 'time_reported', 'occured_from_date', 'occured_from_time', 'occured_to_date', 'occured_to_time', 'description', 'location', 'disposition']

for year in range(2011, 2012):
    # One DataFrame per successfully parsed month; combined once per year.
    month_frames = []
    for month in months:
        month_log = None
        for template in url_templates:
            try:
                month_log = get_from_url(template.format(month, year))
                break
            except ValueError:
                # This spelling was not found on the server; try the next.
                continue
        else:
            print('PDF not retrievable for {} {}'.format(month, year))
        if month_log is None:
            # Either no URL worked or the PDF could not be parsed.
            continue

        num_pages = month_log.getNumPages()
        rows = list()
        for page_num in range(num_pages):
            try:
                month_content = month_log.getPage(page_num).extractText()
                rows += text2list(month_content)
            except Exception:
                # Best effort: skip a page that cannot be extracted and
                # keep processing the rest of the month.
                print('Unable to extract content for page {} in {} {}'.format(page_num, month, year))
        month_frames.append(pd.DataFrame(rows, columns=log_columns))

    # DataFrame.append was removed in pandas 2.0; concatenating once also
    # avoids re-copying the accumulated data for every month. pd.concat
    # rejects an empty list, so fall back to an empty frame when no month
    # produced data.
    year_df = pd.concat(month_frames) if month_frames else pd.DataFrame()
    year_df.to_csv('{}.csv'.format(year), mode='w+')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
An analysis of the crime rate at Arizona State University is available on Medium.