Last active
July 6, 2016 00:50
-
-
Save Bachmann1234/3731ab2d7af6994833225687e0205059 to your computer and use it in GitHub Desktop.
Date/address extraction (Investigation of https://github.com/codeforboston/cornerwise/issues/210)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
import json | |
import re | |
import os | |
# Matches the sentence announcing the meeting and lazily captures everything up
# to and including the first "a.m." / "p.m." marker that follows it on the line.
_MEETING_DATE_RE = re.compile(
    r'The City of Somerville Design Review Committee held a public meeting on '
    r'(?P<date>.*? (?:a\.m\.|p\.m\.))'
)


def find_date(doc_text):
    """
    Extract the meeting date/time phrase from the text of a meeting document.

    This is fragile by nature: it relies on the key phrase
    "The City of Somerville Design Review Committee held a public meeting on"
    and returns whatever follows it up to the first "a.m." or "p.m.".

    doc_text: full text of the document.

    Returns the date string (e.g. "Thursday, July 30, 2015, at 6:30 p.m."),
    or None when the key phrase is not present.
    """
    match = _MEETING_DATE_RE.search(doc_text)
    if match is None:
        # No recognizable meeting sentence; let the caller decide what to do.
        return None
    return match.group('date')
def extract_address(re_match, doc_text):
    """
    Return the address line that precedes a matched description.

    The address is taken to be the last non-blank line before the start of
    ``re_match``. Any leading meeting time such as "6:30 p.m." or "9:30 a.m."
    is removed so only the address remains.
    This can be improved by validating the text to see if it is an address.

    re_match: match object whose start() marks where the description begins.
    doc_text: full text of the document the match was found in.

    Returns the stripped address string, or '' when no non-blank line
    precedes the match.
    """
    search_space = doc_text[:re_match.start()]
    # Whitespace-only lines are treated as blank so they are never picked
    # as the address.
    candidate_lines = [line for line in search_space.split('\n') if line.strip()]
    if not candidate_lines:
        return ''
    # Drop the time (with an optional a.m./p.m. suffix) so we get just the address.
    return re.sub(r'\d+:\d+\s*(?:a\.m\.|p\.m\.)?', '', candidate_lines[-1]).strip()
def extract_events(doc_text):
    """
    Collect every "Description: ..." entry in the document as an event.

    Each event is a dict with the description text (everything after
    "Description: " up to the first period) and the address found just
    before that description by extract_address.

    Returns a list of such dicts, in document order.
    """
    pattern = re.compile(r"Description: (?P<description>.*?)\.")
    return [
        {
            'description': hit.group('description'),
            'address': extract_address(hit, doc_text),
        }
        for hit in pattern.finditer(doc_text)
    ]
def main():
    """
    Print the extracted date and events for one text file or a directory.

    The first command-line argument is either a single file or a directory;
    for a directory every entry ending in ".txt" is processed, in sorted
    order so the output is reproducible. Each file is decoded as ISO-8859-9
    and reported as a pretty-printed JSON object on stdout.

    Exits with a usage message when no path argument is given.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: {} <txt-file-or-directory>".format(sys.argv[0]))
    path = sys.argv[1]
    if os.path.isdir(path):
        # os.listdir order is arbitrary; sort for deterministic output.
        files = sorted(
            os.path.join(path, name)
            for name in os.listdir(path)
            if name.endswith(".txt")
        )
    else:
        files = [path]
    for file_path in files:
        with open(file_path, encoding="ISO-8859-9") as infile:
            doc_text = infile.read()
        report = {
            "date": find_date(doc_text),
            "events": extract_events(doc_text),
        }
        print(json.dumps(report, sort_keys=True, indent=4, separators=(',', ': ')))
        # print() appends its own newline, so this separates reports with
        # two blank lines.
        print('\n')


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import sys | |
import subprocess | |
def convert_pdf_to_txt(path):
    """
    Convert the PDF at ``path`` to text by running the ``pdftotext`` command.

    The command is given only the input path, so the output location is
    whatever pdftotext picks by default (presumably a .txt file next to the
    input; confirm against the pdftotext manual).

    path: path of the PDF file to convert.

    Raises subprocess.CalledProcessError if pdftotext exits with a non-zero
    status, and OSError (e.g. FileNotFoundError) if it cannot be started.
    """
    # Pass the arguments as a list and skip the shell, so quotes, spaces or
    # shell metacharacters in the file name cannot break or alter the command.
    subprocess.check_output(['pdftotext', path])
if __name__ == '__main__':
    # Convert every PDF named on the command line; a failure on one file is
    # reported and does not stop the remaining files from being processed.
    for arg in sys.argv[1:]:
        # Check the extension (case-insensitively) instead of looking for
        # "pdf" anywhere in the path, which also matched names like
        # "pdfs/notes.txt" and missed "REPORT.PDF".
        if not arg.lower().endswith(".pdf"):
            print("Skipping {} as it does not look like a pdf to me".format(arg))
            continue
        try:
            convert_pdf_to_txt(arg)
        except (subprocess.CalledProcessError, OSError) as e:
            # Include the reason so a failed conversion can be diagnosed.
            print("failed to extract file {}: {}".format(arg, e))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"date": "Thursday, July 30, 2015, at 6:30 p.m.", | |
"events": [ | |
{ | |
"address": "231 Lowell Street", | |
"description": "Review of the site plan and design for 19 residential units in 3 structures" | |
} | |
] | |
} | |
{ | |
"date": "Thursday, September 24, 2015, at 6:30 p.m.", | |
"events": [ | |
{ | |
"address": "40 Medford Street", | |
"description": "The DRC reviewed revised materials for the proposed development at 40 Medford Street with respect to the landscape requirements, parking aisle, and primary fa\u00e7ade, as presented by the project architect" | |
} | |
] | |
} | |
{ | |
"date": "Thursday, October 15, 2015, at 6:45 p.m.", | |
"events": [ | |
{ | |
"address": "220 Washington Street", | |
"description": "The DRC reviewed materials for the proposed development at 220 Washington Street with respect to the fa\u00e7ade color, membrane roof system, HVAC equipment, and potential application of art to the fa\u00e7ade" | |
}, | |
{ | |
"address": "400-406 Mystic Avenue", | |
"description": "The DRC reviewed materials for the proposed development at 400-406 Mystic Avenue with respect to the general use, massing and style proposed, though this was difficult as the architect did not provide sufficient materials for a thorough review" | |
} | |
] | |
} | |
{ | |
"date": "Thursday, January 14, 2016, at 6:30 p.m.", | |
"events": [ | |
{ | |
"address": "400-406 Mystic Ave.", | |
"description": "New 20 unit residential building with ground floor retail and 35 parking spaces" | |
}, | |
{ | |
"address": "70 Prospect.", | |
"description": "Material review" | |
}, | |
{ | |
"address": "50 Middlesex Ave.", | |
"description": "Addition of awnings to existing structure" | |
}, | |
{ | |
"address": "ASQ Block 11", | |
"description": "Review of proposed Partners daycare facility and signage" | |
} | |
] | |
} | |
{ | |
"date": "Thursday, April 28, 2016, at 6:30 p.m.", | |
"events": [ | |
{ | |
"address": "1060 Broadway / Powder House School:", | |
"description": "Renovation of Powder House School Structure" | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment