Skip to content

Instantly share code, notes, and snippets.

@Bachmann1234
Last active July 6, 2016 00:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Bachmann1234/3731ab2d7af6994833225687e0205059 to your computer and use it in GitHub Desktop.
Save Bachmann1234/3731ab2d7af6994833225687e0205059 to your computer and use it in GitHub Desktop.
Date/address extraction (Investigation of https://github.com/codeforboston/cornerwise/issues/210)
#!/usr/bin/env python3
import sys
import json
import re
import os
# Key phrase that introduces the meeting date in the minutes. The date is
# captured lazily, up to the first " a.m." / " p.m." marker on the same line,
# so trailing text that mentions another time is not swallowed.
_MEETING_DATE_RE = re.compile(
    r'The City of Somerville Design Review Committee held a public meeting on '
    r'(?P<date>.*? [ap]\.m\.)'
)


def find_date(doc_text):
    """
    Extract the meeting date from the text of a set of meeting minutes.

    This is inherently fragile: it looks for the fixed sentence
    "The City of Somerville Design Review Committee held a public meeting on"
    and returns whatever follows it up to and including the first
    "a.m." or "p.m." on that line.

    :param doc_text: full text of the document to search.
    :return: the date/time string (e.g. "Thursday, July 30, 2015, at 6:30 p.m."),
        or None when the key phrase is not present.
    """
    match = _MEETING_DATE_RE.search(doc_text)
    # Report "not found" as None rather than failing on ``None.group``.
    return match.group('date') if match else None
def extract_address(re_match, doc_text):
    """
    Return the address that precedes a matched description.

    The address is taken to be the last non-blank line of text before the
    start of ``re_match``, with any clock time (e.g. "6:30 p.m.") removed.
    No attempt is made to validate that the line really is an address.

    :param re_match: regex match object marking where the description starts.
    :param doc_text: the full document text the match was found in.
    :return: the stripped address line, or '' when nothing but blank text
        precedes the match.
    """
    preceding_text = doc_text[:re_match.start()]
    # Lines holding only whitespace are treated as blank so they cannot be
    # picked up as an (empty) address.
    candidate_lines = [line for line in preceding_text.split('\n') if line.strip()]
    if not candidate_lines:
        return ''
    # Drop the time, including an optional a.m./p.m. suffix, so only the
    # address remains.
    return re.sub(r'\d+:\d+\s*(?:[ap]\.m\.)?', '', candidate_lines[-1]).strip()
def extract_events(doc_text):
    """
    Collect every agenda item found in the document text.

    An item is recognised by the text "Description: " followed by anything up
    to the next period on the same line. For each one, the address is looked
    up with ``extract_address`` from the text that precedes the match.

    :param doc_text: full text of the document to scan.
    :return: list of dicts with 'description' and 'address' keys, in the
        order the descriptions appear in the document.
    """
    description_pattern = re.compile(r"Description: (?P<description>.*?)\.")
    return [
        {
            'description': match.group('description'),
            'address': extract_address(match, doc_text),
        }
        for match in description_pattern.finditer(doc_text)
    ]
def main():
    """
    Print the extracted meeting date and events for one or more text files.

    The single command-line argument is either a text file or a directory;
    for a directory, every entry whose name ends in ".txt" is processed.
    One JSON document is printed per file, followed by blank lines.

    Exits with a usage message when no path argument is given.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: {} <text file or directory of .txt files>'.format(sys.argv[0]))
    path = sys.argv[1]
    if os.path.isdir(path):
        # os.listdir returns entries in arbitrary order; sort so repeated
        # runs over the same directory print results in the same order.
        files = sorted(
            os.path.join(path, name)
            for name in os.listdir(path)
            if name.endswith(".txt")
        )
    else:
        files = [path]
    for f in files:
        with open(f, encoding="ISO-8859-9") as infile:
            doc_text = infile.read()
        date = find_date(doc_text)
        descriptions = extract_events(doc_text)
        print(json.dumps({"date": date, "events": descriptions}, sort_keys=True, indent=4, separators=(',', ': ')))
        print('\n')


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
import sys
import subprocess
def convert_pdf_to_txt(path):
    """
    Convert one PDF to text by running the external ``pdftotext`` command.

    The command is invoked with an argument list instead of a shell string,
    so the path is handed to ``pdftotext`` verbatim: quotes, spaces, ``$`` or
    backticks in a file name cannot break the command or be interpreted by a
    shell. No output file name is passed, so where the text is written is
    left to ``pdftotext`` itself.

    :param path: path of the PDF file to convert.
    :raises subprocess.CalledProcessError: if ``pdftotext`` exits non-zero.
    :raises OSError: if the ``pdftotext`` executable cannot be started.
    """
    # check_output (rather than check_call) keeps anything the tool writes to
    # stdout from being echoed to the terminal, as before.
    subprocess.check_output(['pdftotext', path])
if __name__ == '__main__':
    # Convert every command-line argument that names a PDF; report and skip
    # everything else.
    for arg in sys.argv[1:]:
        # Decide by file extension. A plain substring test would also accept
        # paths that merely contain "pdf" somewhere, e.g. "pdfs/notes.txt".
        if not arg.lower().endswith(".pdf"):
            print("Skipping {} as it does not look like a pdf to me".format(arg))
            continue
        try:
            convert_pdf_to_txt(arg)
        except Exception as e:
            # Best-effort batch: one bad file must not stop the rest, but say
            # why it failed instead of discarding the error.
            print("failed to extract file {}: {}".format(arg, e))
{
"date": "Thursday, July 30, 2015, at 6:30 p.m.",
"events": [
{
"address": "231 Lowell Street",
"description": "Review of the site plan and design for 19 residential units in 3 structures"
}
]
}
{
"date": "Thursday, September 24, 2015, at 6:30 p.m.",
"events": [
{
"address": "40 Medford Street",
"description": "The DRC reviewed revised materials for the proposed development at 40 Medford Street with respect to the landscape requirements, parking aisle, and primary fa\u00e7ade, as presented by the project architect"
}
]
}
{
"date": "Thursday, October 15, 2015, at 6:45 p.m.",
"events": [
{
"address": "220 Washington Street",
"description": "The DRC reviewed materials for the proposed development at 220 Washington Street with respect to the fa\u00e7ade color, membrane roof system, HVAC equipment, and potential application of art to the fa\u00e7ade"
},
{
"address": "400-406 Mystic Avenue",
"description": "The DRC reviewed materials for the proposed development at 400-406 Mystic Avenue with respect to the general use, massing and style proposed, though this was difficult as the architect did not provide sufficient materials for a thorough review"
}
]
}
{
"date": "Thursday, January 14, 2016, at 6:30 p.m.",
"events": [
{
"address": "400-406 Mystic Ave.",
"description": "New 20 unit residential building with ground floor retail and 35 parking spaces"
},
{
"address": "70 Prospect.",
"description": "Material review"
},
{
"address": "50 Middlesex Ave.",
"description": "Addition of awnings to existing structure"
},
{
"address": "ASQ Block 11",
"description": "Review of proposed Partners daycare facility and signage"
}
]
}
{
"date": "Thursday, April 28, 2016, at 6:30 p.m.",
"events": [
{
"address": "1060 Broadway / Powder House School:",
"description": "Renovation of Powder House School Structure"
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment