# SAPD Neighborhood Watch Db
# Source: gist.github.com/jcrubino/3066398 (created July 7, 2012)
# SAPD Neighborhood Watch Db
# Parses San Antonio Police Neighborhood Calls pdfs into json for dbs
#
# Dev Env: Ubuntu 12.04
# Licensed under the GNU General Public License: http://www.gnu.org/licenses/gpl.html
# Requires xpdf utils available at the command line (not a python library (yet))
#          pymongo & MongoDB
# pdfs from http://www.sanantonio.gov/neighborhoodpolicecalls/policecalls.aspx
#
# GOAL: Auto-updating JSON RESTful server of SAPD Neighborhood Calls.
#       Parse current PDFs and crawl historical info.
#       Create an open-source SAPD "Calls for Service" db to serve and
#       protect the people with open data.
#
# TO DO: Everything!!
#        - Parse the location key better: separate address from neighborhood.
#        - Add lat/long to the location data.
#        - Create one function that updates the local db:
#          1) read the SAPD page for new, unprocessed pdfs
#          2) process each pdf and load it into mongodb
import os | |
import shutil | |
from random import randint | |
import subprocess as sub | |
import pymongo | |
cwd = os.getcwd() | |
# Checks for existance of {backups} directory | |
# creates one if does not exist | |
try: | |
if 'backups' not in os.listdir(cwd): | |
os.mkdir('backups') | |
except Exception, e: | |
print e | |
pass | |
conn = pymongo.Connection() | |
pdb = conn.local.pdb | |
insert = pdb.insert | |
def list_filter(list,item): | |
""" | |
note: this is a filter that includes | |
returns a list of elements filtered if item in element | |
Rename: list_filter_include() | |
exempli gratia | |
>>> list_filter(file_names_list, '.txt') | |
>>> ['a.txt', 'b.txt', 'c.txt'] | |
""" | |
return [ elem for elem in list if item in elem ] | |
def list_to_dict(list, func): | |
""" | |
takes a list; list elements are keys, applies a function | |
to create values. | |
exempli gratia | |
>>> list_to_dict(file_names_list,string_uppercase) | |
>>> {'a.txt': 'A.TXT', 'b.txt': 'B.TXT', 'c.txt': 'C.TXT'} | |
""" | |
book = {} | |
for item in list: | |
book[item] = func(item) | |
return book | |
def read_file(file_name): | |
""" | |
Returns the string from a opened file | |
""" | |
with open(file_name, 'r') as f: | |
doc = f.read() | |
return doc | |
def remove_chars(char_list, doc_string): | |
""" | |
Removes chars from string | |
""" | |
doc_lines = doc_string.split('\n') | |
for item in char_list: | |
doc_lines = [x.replace(item, '') for x in doc_lines] | |
return '\n'.join(doc_lines) | |
def list_filter_exclude(string, doc_lines): | |
""" | |
Removes strings from list of lines | |
exempli gratia | |
>>> doc_lines = ['hello','world','this is the end'] | |
>>> list_filter_exclude('this is the end', doc_lines) | |
>>> ['hello', 'world'] | |
""" | |
return [line for line in doc_lines if string not in line] | |
def count_white_space(string): | |
""" | |
exempli gratia | |
>>> count_white_space('hello world, this is text') | |
>>> ['hello',2,'world,', 1, 'this', 1, 'is', 1, 'text'] | |
""" | |
Array = [] | |
split_string = string.split(' ') | |
for elem in split_string: | |
if elem != '': | |
if split_string.index(elem) != 0: | |
try: | |
Array.append(n) | |
except IndexError: | |
n = 1 | |
Array.append(n) | |
Array.append(elem) | |
n = 0 | |
if elem == '': | |
try: | |
n +=1 | |
except NameError,e: | |
n = 1 | |
return Array | |
# converts parsed text to json | |
def format_data(clean_line): | |
data = clean_line.split() | |
Hash = {} | |
Hash['sapd_id'] = data[0].lower() | |
_date = [bit for bit in data if bit.count('/') == 2][0] | |
n = data.index(_date) | |
Hash['date'] = _date | |
Hash['time'] = data[n + 1] | |
Hash['location'] = ' '.join(data[n + 2:]).lower() | |
Hash['type'] = ' '.join(data[1:n]).lower() | |
return Hash | |
def get_files(file_type): | |
""" | |
exempli gratia | |
>>> get_files('.txt') | |
>>> [ 'a.txt', 'b.txt', 'c.txt'] | |
""" | |
return list_filter(os.listdir(os.getcwd()), file_type) | |
def convert_pdfs(): | |
""" | |
converts pdf in current working directory to text WITHOUT ARGUMENTS | |
moves pdf to folder named {/backups} | |
""" | |
pdf_list = get_files('.pdf') | |
for pdf in pdf_list: | |
converted_file = pdf[36:-4].lower()+'.txt' | |
sub.check_call(['pdftotext','-layout', pdf, converted_file]) | |
shutil.copy(pdf,cwd+'/backups') | |
os.remove(pdf) | |
def parse_logs(): | |
""" | |
parses output text from convert pdfs WITHOUT ARGUMENTS | |
loads json record into mongodb | |
""" | |
text_files_list = list_filter(os.listdir(os.getcwd()), '.txt') | |
file_book = list_to_dict(text_files_list, read_file) | |
for key,item in file_book.iteritems(): | |
sp1 = remove_chars(['\r','\x0c'], item).split('\n') | |
sp2 = list_filter_exclude("Incident",sp1) | |
sp2 = [line for line in sp2 if 'SAPD' in line] | |
data_doc = map(format_data, sp2) | |
load_data = map(insert, data_doc) # loads data into db | |
if __name__ == "__main__": | |
convert_pdfs() | |
parse_logs() |