# SAPD Neighborhood Watch Db
# Source: gist.github.com/jcrubino/3066398 (created July 7, 2012)
# SAPD Neighborhood Watch Db
# Parses San Antonio Police Neighborhood Calls pdfs into json for dbs
#
# Dev Env: Ubuntu 12.04
# Licensed under the GNU General Public License: http://www.gnu.org/licenses/gpl.html
# Requires xpdf utils available at the command line (not a python library (yet))
#          pymongo & MongoDB
# pdfs from http://www.sanantonio.gov/neighborhoodpolicecalls/policecalls.aspx
#
# GOAL: Auto-updating JSON RESTful server of SAPD Neighborhood Calls.
#       Parse current PDFs and crawl historical info.
#       Create an open-source SAPD "Calls for Service" db to serve and
#       protect the people with open data.
#
# TO DO: Everything!!
#        - Parse the location key better: separate address from neighborhood.
#        - Add lat/long to the location data.
#        - Create one function that updates the local db:
#          1) read the SAPD page for new, unprocessed pdfs
#          2) process each pdf and load it into mongodb
import os | |
import shutil | |
from random import randint | |
import subprocess as sub | |
import pymongo | |
cwd = os.getcwd() | |
# Checks for existance of {backups} directory | |
# creates one if does not exist | |
try: | |
if 'backups' not in os.listdir(cwd): | |
os.mkdir('backups') | |
except Exception, e: | |
print e | |
pass | |
conn = pymongo.Connection() | |
pdb = conn.local.pdb | |
insert = pdb.insert | |
def list_filter(list,item): | |
""" | |
note: this is a filter that includes | |
returns a list of elements filtered if item in element | |
Rename: list_filter_include() | |
exempli gratia | |
>>> list_filter(file_names_list, '.txt') | |
>>> ['a.txt', 'b.txt', 'c.txt'] | |
""" | |
return [ elem for elem in list if item in elem ] | |
def list_to_dict(list, func): | |
""" | |
takes a list; list elements are keys, applies a function | |
to create values. | |
exempli gratia | |
>>> list_to_dict(file_names_list,string_uppercase) | |
>>> {'a.txt': 'A.TXT', 'b.txt': 'B.TXT', 'c.txt': 'C.TXT'} | |
""" | |
book = {} | |
for item in list: | |
book[item] = func(item) | |
return book | |
def read_file(file_name): | |
""" | |
Returns the string from a opened file | |
""" | |
with open(file_name, 'r') as f: | |
doc = f.read() | |
return doc | |
def remove_chars(char_list, doc_string): | |
""" | |
Removes chars from string | |
""" | |
doc_lines = doc_string.split('\n') | |
for item in char_list: | |
doc_lines = [x.replace(item, '') for x in doc_lines] | |
return '\n'.join(doc_lines) | |
def list_filter_exclude(string, doc_lines): | |
""" | |
Removes strings from list of lines | |
exempli gratia | |
>>> doc_lines = ['hello','world','this is the end'] | |
>>> list_filter_exclude('this is the end', doc_lines) | |
>>> ['hello', 'world'] | |
""" | |
return [line for line in doc_lines if string not in line] | |
def count_white_space(string): | |
""" | |
exempli gratia | |
>>> count_white_space('hello world, this is text') | |
>>> ['hello',2,'world,', 1, 'this', 1, 'is', 1, 'text'] | |
""" | |
Array = [] | |
split_string = string.split(' ') | |
for elem in split_string: | |
if elem != '': | |
if split_string.index(elem) != 0: | |
try: | |
Array.append(n) | |
except IndexError: | |
n = 1 | |
Array.append(n) | |
Array.append(elem) | |
n = 0 | |
if elem == '': | |
try: | |
n +=1 | |
except NameError,e: | |
n = 1 | |
return Array | |
# converts parsed text to json | |
def format_data(clean_line): | |
data = clean_line.split() | |
Hash = {} | |
Hash['sapd_id'] = data[0].lower() | |
_date = [bit for bit in data if bit.count('/') == 2][0] | |
n = data.index(_date) | |
Hash['date'] = _date | |
Hash['time'] = data[n + 1] | |
Hash['location'] = ' '.join(data[n + 2:]).lower() | |
Hash['type'] = ' '.join(data[1:n]).lower() | |
return Hash | |
def get_files(file_type): | |
""" | |
exempli gratia | |
>>> get_files('.txt') | |
>>> [ 'a.txt', 'b.txt', 'c.txt'] | |
""" | |
return list_filter(os.listdir(os.getcwd()), file_type) | |
def convert_pdfs(): | |
""" | |
converts pdf in current working directory to text WITHOUT ARGUMENTS | |
moves pdf to folder named {/backups} | |
""" | |
pdf_list = get_files('.pdf') | |
for pdf in pdf_list: | |
converted_file = pdf[36:-4].lower()+'.txt' | |
sub.check_call(['pdftotext','-layout', pdf, converted_file]) | |
shutil.copy(pdf,cwd+'/backups') | |
os.remove(pdf) | |
def parse_logs(): | |
""" | |
parses output text from convert pdfs WITHOUT ARGUMENTS | |
loads json record into mongodb | |
""" | |
text_files_list = list_filter(os.listdir(os.getcwd()), '.txt') | |
file_book = list_to_dict(text_files_list, read_file) | |
for key,item in file_book.iteritems(): | |
sp1 = remove_chars(['\r','\x0c'], item).split('\n') | |
sp2 = list_filter_exclude("Incident",sp1) | |
sp2 = [line for line in sp2 if 'SAPD' in line] | |
data_doc = map(format_data, sp2) | |
load_data = map(insert, data_doc) # loads data into db | |
if __name__ == "__main__": | |
convert_pdfs() | |
parse_logs() |