@weirdestnerd
Last active December 19, 2019 08:00
import os
import io
import requests
import PyPDF2
import calendar
from functools import reduce
import pandas as pd

def get_from_url(url=''):
    # make sure the link ends with .pdf
    if not url.endswith('.pdf'):
        url += '.pdf'
    # get the current working directory's path
    folderpath = os.getcwd()
    # name the pdf that's to be downloaded
    filepath = os.path.join(folderpath, '_temp.pdf')
    response = requests.get(url, stream=True)
    if response.status_code != 200:
        raise ValueError('URL not found')
    # write the content of the url locally to a pdf file
    with open(filepath, 'wb') as pdf_file:
        for chunk in response.iter_content(chunk_size=512):
            pdf_file.write(chunk)
    pdf_reader = None
    try:
        # read the downloaded PDF into memory and create a PdfFileReader
        # instance for it, so the temporary file can be removed safely
        with open(filepath, mode='rb') as pdf_file:
            pdf_reader = PyPDF2.PdfFileReader(io.BytesIO(pdf_file.read()))
    except Exception:
        print('Problem parsing pdf.')
    # delete the downloaded PDF
    os.remove(filepath)
    return pdf_reader

def text2list(text):
    if text == '':
        return []
    # split the text extract by new line
    split_text = text.split('\n')
    # remove unwanted values (trailing and leading lines of the extract)
    split_text = split_text[:len(split_text) - 15]
    split_text = split_text[4:]
    # holds the rows for each log found in the text extract
    rows = list()
    disposition_types = ['cleared by arrest', 'cleared by exceptional means', 'closed', 'inactive',
                         'other agency jurisdiction', 'mcao transmitted', 'mcao turn down',
                         'reported to institution', 'pending', 'unfounded']
    current_index = 8
    start_index = 0
    # find the next reported disposition in the extract,
    # combine a split address into one value,
    # and add a new row that contains the crime log
    while current_index < len(split_text):
        isDisp = split_text[current_index].lower() in disposition_types
        # find the index of the disposition
        while not isDisp and current_index < len(split_text):
            isDisp = split_text[current_index].lower() in disposition_types
            current_index += 1
        new_row = split_text[start_index:current_index]
        # if the address is split across lines
        if len(new_row) > 10:
            # leave the values prior to the address as is,
            # combine the address into one value,
            # and append the disposition
            new_row = new_row[:8] + \
                [reduce(lambda add, val: add + val, new_row[8:-1])] + \
                [new_row[-1]]
        start_index = current_index
        current_index += 8
        rows.append(new_row)
    return rows

months = list(map(lambda i: calendar.month_name[i], range(1, 13)))
for year in range(2011, 2012):
    year_df = pd.DataFrame()
    for month in months:
        # try the URL with a space between month and year, then fall back to no space
        try:
            month_log = get_from_url('https://www.asu.edu/police/logs/{} {}.pdf'.format(month, year))
        except ValueError:
            try:
                month_log = get_from_url('https://www.asu.edu/police/logs/{}{}.pdf'.format(month, year))
            except ValueError:
                print('PDF not retrievable for {} {}'.format(month, year))
                month_log = None
        if not month_log:
            continue
        num_pages = month_log.getNumPages()
        rows = list()
        for page_num in range(num_pages):
            try:
                month_content = month_log.getPage(page_num).extractText()
                rows += text2list(month_content)
            except Exception:
                print('Unable to extract content for page {} in {} {}'.format(page_num, month, year))
        month_df = pd.DataFrame(rows,
                                columns=['incident_id', 'date_reported', 'time_reported',
                                         'occurred_from_date', 'occurred_from_time',
                                         'occurred_to_date', 'occurred_to_time',
                                         'description', 'location', 'disposition'])
        year_df = year_df.append(month_df)
    year_df.to_csv('{}.csv'.format(year), mode='w+')
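
For reference, the per-year CSV written above can be loaded straight back into pandas. A minimal sketch, assuming the script was run for 2011 as in the loop above:

import pandas as pd

# read a year's crime log back in; the first CSV column is the saved DataFrame index
crime_log = pd.read_csv('2011.csv', index_col=0)

# quick sanity checks: number of logs and the most common dispositions
print(len(crime_log))
print(crime_log['disposition'].value_counts().head())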
An analysis of the crime rate at Arizona State University is available on Medium.
