Skip to content

Instantly share code, notes, and snippets.

@jcrubino
Created July 7, 2012 13:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jcrubino/3066398 to your computer and use it in GitHub Desktop.
Save jcrubino/3066398 to your computer and use it in GitHub Desktop.
SAPD Neighborhood Watch Db
# SAPD Neighborhood Watch Db
# Parses San Antonio Police Neighborhood Calls pdfs into json for dbs
#
#
# Dev Env: Ubuntu 12.04
# Licensed under the GNU General Public License: http://www.gnu.org/licenses/gpl.html
# Requires xpdf utils available at the command line (not a python library (yet))
# pymongo & Mongodb
# pdfs from http://www.sanantonio.gov/neighborhoodpolicecalls/policecalls.aspx
#
# GOAL: Auto Updating JSON RESTful Server of SAPD Neighborhood Calls
# Parse Current PDFs and Crawl Historical Info
# Create an opensource SAPD "Calls for Service" db to serve and protect the people with open data
#
#
# TO DO: Everything!!
# The location key could be parsed better: separation of address and neighborhood
# Add lat,long to the location data
# create 1 function that updates the local db:
# 1) Read SAPD Page for new unprocessed pdf
# 2) Process pdf load into mongodb
#
#
import os
import shutil
from random import randint
import subprocess as sub
import pymongo
cwd = os.getcwd()
# Checks for existence of the {backups} directory and creates one if it does
# not exist; convert_pdfs() copies every processed pdf into it.
# os.path.isdir is used instead of a name lookup in os.listdir so that a
# regular file called "backups" is not mistaken for the directory.
if not os.path.isdir(os.path.join(cwd, 'backups')):
    try:
        os.mkdir(os.path.join(cwd, 'backups'))
    except OSError as e:
        # Best effort: report the problem and carry on, as before, but only
        # for file-system errors -- anything else should surface normally.
        print(e)
# Module-level MongoDB handles, created as soon as the module is loaded.
# NOTE(review): pymongo.Connection is the legacy client class; newer PyMongo
# releases provide MongoClient instead -- confirm against the installed version.
conn = pymongo.Connection()
# The `pdb` collection of the `local` database.
pdb = conn.local.pdb
# Bound insert method of that collection; parse_logs() calls it once per record.
insert = pdb.insert
def list_filter(list, item):
    """
    Inclusive filter: keep the elements that contain `item`.

    Every element of `list` is tested with `item in element`; the ones that
    pass are returned in their original order as a new list.
    (Possible rename: list_filter_include().)

    Example:
    >>> list_filter(['a.txt', 'b.pdf', 'c.txt'], '.txt')
    ['a.txt', 'c.txt']
    """
    matches = []
    for candidate in list:
        if item in candidate:
            matches.append(candidate)
    return matches
def list_to_dict(list, func):
    """
    Build a dict whose keys are the elements of `list` and whose values are
    the result of calling `func` on each element.

    Example:
    >>> list_to_dict(['a.txt', 'b.txt'], str.upper) == {'a.txt': 'A.TXT', 'b.txt': 'B.TXT'}
    True
    """
    return {key: func(key) for key in list}
def read_file(file_name):
    """
    Open `file_name` for reading in text mode and return its whole content
    as a single string. The file is closed before returning.
    """
    with open(file_name, 'r') as source:
        return source.read()
def remove_chars(char_list, doc_string):
    """
    Remove every occurrence of each string in `char_list` from `doc_string`.

    char_list  -- iterable of characters / substrings to delete
    doc_string -- the text to clean
    Returns the cleaned text as a new string.

    The replacement is applied to the whole string rather than line by line,
    so newline characters can be removed as well.

    Example:
    >>> remove_chars(['\\r', '\\x0c'], 'a\\r\\nb\\x0c')
    'a\\nb'
    """
    for item in char_list:
        doc_string = doc_string.replace(item, '')
    return doc_string
def list_filter_exclude(string, doc_lines):
    """
    Exclusive filter: drop every line that contains `string`.

    The remaining lines are returned, in order, as a new list.

    Example:
    >>> list_filter_exclude('this is the end', ['hello', 'world', 'this is the end'])
    ['hello', 'world']
    """
    kept = []
    for line in doc_lines:
        if string in line:
            continue
        kept.append(line)
    return kept
def count_white_space(string):
    """
    Split `string` on single spaces and return its words interleaved with
    the number of spaces that separates each word from the previous one.

    Leading spaces are reported as a count in front of the first word;
    trailing spaces are ignored. Only the space character is counted.

    Example:
    >>> count_white_space('hello  world, this is text')
    ['hello', 2, 'world,', 1, 'this', 1, 'is', 1, 'text']
    """
    result = []
    previous = None  # position (in the split list) of the last word seen
    for position, elem in enumerate(string.split(' ')):
        if elem == '':
            continue
        if previous is None:
            # Every empty piece before the first word is one leading space.
            if position:
                result.append(position)
        else:
            # Consecutive pieces are one space apart, so the distance between
            # two words in the split list is the number of spaces between them.
            result.append(position - previous)
        result.append(elem)
        previous = position
    return result
# converts one cleaned line of parsed text into a json-style record
def format_data(clean_line):
    """
    Turn one whitespace-separated report line into a dict.

    The first token containing exactly two '/' characters is taken as the
    date. Relative to it:
      sapd_id  -- first token of the line, lower-cased
      type     -- tokens between the id and the date, lower-cased
      date     -- the date token, unchanged
      time     -- the token right after the date, unchanged
      location -- every token after the time, lower-cased

    Raises IndexError when the line is empty, has no date token, or has
    nothing after the date.
    """
    tokens = clean_line.split()
    record_id = tokens[0].lower()
    date_positions = [pos for pos, tok in enumerate(tokens) if tok.count('/') == 2]
    date_at = date_positions[0]
    return {
        'sapd_id': record_id,
        'date': tokens[date_at],
        'time': tokens[date_at + 1],
        'location': ' '.join(tokens[date_at + 2:]).lower(),
        'type': ' '.join(tokens[1:date_at]).lower(),
    }
def get_files(file_type):
    """
    Return the names in the current working directory that end with
    `file_type` (e.g. '.txt').

    Names are matched on their ending, not on a substring, so a file such as
    'report.pdf.txt' is listed for '.txt' but not for '.pdf'.

    Example, for a directory holding a.txt, b.txt, c.txt and d.pdf:
        get_files('.txt')  ->  ['a.txt', 'b.txt', 'c.txt']
    (the order is whatever os.listdir returns).
    """
    return [name for name in os.listdir(os.getcwd())
            if name.endswith(file_type)]
def convert_pdfs(prefix_length=36):
    """
    Convert every pdf in the current working directory to text with the
    `pdftotext -layout` command line tool, then copy the pdf into the
    {backups} folder and delete the original.

    prefix_length -- number of leading characters of the pdf file name that
                     are dropped when naming the text file (default 36, the
                     value previously hard-coded).
                     NOTE(review): presumably the length of a fixed prefix on
                     the downloaded SAPD file names -- confirm.

    The text file is named after the lower-cased remainder of the pdf name
    with '.pdf' replaced by '.txt'. When the name is too short for the prefix
    to be stripped, the whole base name is used instead, so short names no
    longer all collapse onto a single '.txt' file.

    Raises subprocess.CalledProcessError when pdftotext exits with a
    non-zero status; the pdf is left in place in that case.
    """
    backup_dir = os.path.join(cwd, 'backups')
    for pdf in get_files('.pdf'):
        stem = pdf[prefix_length:-4].lower()
        if not stem:
            # Name no longer than the prefix: keep the full base name.
            stem = pdf[:-4].lower()
        converted_file = stem + '.txt'
        sub.check_call(['pdftotext', '-layout', pdf, converted_file])
        # Copy first, delete second, so the pdf is never lost half-way.
        shutil.copy(pdf, backup_dir)
        os.remove(pdf)
def parse_logs():
    """
    Parse the text files in the current working directory (the output of
    convert_pdfs) and load one record per report line into mongodb.
    Takes no arguments.

    For each '.txt' file: carriage returns and form feeds are stripped,
    lines containing "Incident" are dropped, and only lines containing
    'SAPD' are kept. Each remaining line is converted with format_data()
    and stored with the module-level `insert`.
    """
    text_files_list = get_files('.txt')
    file_book = list_to_dict(text_files_list, read_file)
    for text in file_book.values():
        lines = remove_chars(['\r', '\x0c'], text).split('\n')
        lines = list_filter_exclude("Incident", lines)
        # Parse the whole file before touching the db, so a malformed line
        # aborts before any record of that file has been inserted.
        records = [format_data(line) for line in lines if 'SAPD' in line]
        # Explicit loop: map() is lazy on Python 3 and would insert nothing.
        for record in records:
            insert(record)
def main():
    """
    Run the whole pipeline: convert the pdfs in the working directory to
    text, then parse the text files and load them into the db.
    """
    convert_pdfs()
    parse_logs()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment