@weirdestnerd
Last active December 19, 2019 08:00
import os
import io
import requests
import PyPDF2
import calendar
from functools import reduce
import pandas as pd

def get_from_url(url=''):
    # make sure the link ends with .pdf
    if not url.endswith('.pdf'):
        url += '.pdf'
    # get the current working directory's path
    folderpath = os.getcwd()
    # name the pdf that's to be downloaded
    filepath = os.path.join(folderpath, '_temp.pdf')
    response = requests.get(url, stream=True)
    if response.status_code != 200:
        raise ValueError('URL not found')
    # write the content of the url locally to a pdf file
    with open(filepath, 'wb') as pdf_file:
        for chunk in response.iter_content(chunk_size=512):
            pdf_file.write(chunk)
    pdf_reader = None
    try:
        # read the downloaded PDF into memory and create a PdfFileReader
        # instance for it, so the temporary file can be removed safely
        with open(filepath, mode='rb') as pdf_file:
            pdf_reader = PyPDF2.PdfFileReader(io.BytesIO(pdf_file.read()))
    except Exception:
        print('Problem parsing pdf.')
    # delete the downloaded PDF
    os.remove(filepath)
    return pdf_reader

def text2list(text):
    if text == '':
        return []
    # split the text extract by new line
    split_text = text.split('\n')
    # remove unwanted values (trailing and leading lines of the extract)
    split_text = split_text[:len(split_text) - 15]
    split_text = split_text[4:]
    # holds the rows for each log found in the text extract
    rows = list()
    disposition_types = ['cleared by arrest', 'cleared by exceptional means', 'closed', 'inactive',
                         'other agency jurisdiction', 'mcao transmitted', 'mcao turn down',
                         'reported to institution', 'pending', 'unfounded']
    current_index = 8
    start_index = 0
    # find the next reported disposition in the extract,
    # combine a split address into one value,
    # and add a new row that contains the crime log
    while current_index < len(split_text):
        isDisp = split_text[current_index].lower() in disposition_types
        # find the index of the disposition
        while not isDisp and current_index < len(split_text):
            isDisp = split_text[current_index].lower() in disposition_types
            current_index += 1
        new_row = split_text[start_index:current_index]
        # if the address is split across lines
        if len(new_row) > 10:
            # leave the values prior to the address as is,
            # combine the address into one value,
            # and append the disposition
            new_row = new_row[:8] + \
                [reduce(lambda add, val: add + val, new_row[8:-1])] + \
                [new_row[-1]]
        start_index = current_index
        current_index += 8
        rows.append(new_row)
    return rows

months = list(map(lambda i: calendar.month_name[i], range(1, 13)))
for year in range(2011, 2012):
    year_df = pd.DataFrame()
    for month in months:
        # try the URL with a space between month and year, then fall back to no space
        try:
            month_log = get_from_url('https://www.asu.edu/police/logs/{} {}.pdf'.format(month, year))
        except ValueError:
            try:
                month_log = get_from_url('https://www.asu.edu/police/logs/{}{}.pdf'.format(month, year))
            except ValueError:
                print('PDF not retrievable for {} {}'.format(month, year))
                month_log = None
        if not month_log:
            continue
        num_pages = month_log.getNumPages()
        rows = list()
        for page_num in range(num_pages):
            try:
                month_content = month_log.getPage(page_num).extractText()
                rows += text2list(month_content)
            except Exception:
                print('Unable to extract content for page {} in {} {}'.format(page_num, month, year))
        month_df = pd.DataFrame(rows,
                                columns=['incident_id', 'date_reported', 'time_reported',
                                         'occurred_from_date', 'occurred_from_time',
                                         'occurred_to_date', 'occurred_to_time',
                                         'description', 'location', 'disposition'])
        year_df = year_df.append(month_df)
    year_df.to_csv('{}.csv'.format(year), mode='w+')
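
For reference, the per-year CSV written above can be loaded straight back into pandas. A minimal sketch, assuming the script was run for 2011 as in the loop above:

import pandas as pd

# read a year's crime log back in; the first CSV column is the saved DataFrame index
crime_log = pd.read_csv('2011.csv', index_col=0)

# quick sanity checks: number of logs and the most common dispositions
print(len(crime_log))
print(crime_log['disposition'].value_counts().head())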
An analysis of the crime rate at Arizona State University is available on Medium.
