Created
April 25, 2014 12:49
-
-
Save br0nstein/11288492 to your computer and use it in GitHub Desktop.
Generates a JSON file of all location names in each text file in a given directory
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import json | |
import ner # pyner | |
def locations_tag(directory):
    """
    Finds location terms in all text files in a given directory.

    Every file whose name ends in ".txt" is read as UTF-8, run through
    preprocess(), and handed to the tagger created by
    ner.SocketNER(host='localhost', port=8080). Files for which the tagger
    reports no 'LOCATION' entities are left out of the result.

    Input:
        directory - string representing the local directory to analyze
    Output:
        locations - dictionary mapping each file name containing location
                    terms to the list produced by merge_locations()
    """
    # Local import: keeps this function self-contained without touching the
    # file's import list.
    import io

    locations = {}
    tagger = ner.SocketNER(host='localhost', port=8080)
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(directory, filename)
        # io.open decodes while reading, so the result is text on both
        # Python 2 and Python 3. Calling .decode() on the result of a
        # text-mode read() fails on Python 3, where it is already a str.
        with io.open(path, 'r', encoding="utf8") as f:
            text = preprocess(f.read())
        entities = tagger.get_entities(text)
        if 'LOCATION' in entities:
            locations[filename] = merge_locations(entities['LOCATION'], text)
    return locations
def preprocess(text):
    """Make predefined text replacements to aid in named entity
    recognition.

    Replacements, applied in this fixed order:
        1. newline           -> '. '
        2. 'co.' / 'Co.'     -> 'County' (only at the start of a word)
        3. 'county'          -> 'County'
        4. 'A.T.'            -> 'Arkansas'
        5. 'M.T.'            -> 'Mississippi'

    Input:
        text - string to clean up
    Output:
        the text with all replacements applied
    """
    # Local import: keeps this function self-contained without touching the
    # file's import list.
    import re

    # Newlines go first so that a "co" ending a line becomes "co." and is
    # then expanded; a fixed order keeps the result independent of dict
    # iteration order.
    text = text.replace('\n', '. ')
    # The word boundary stops the abbreviation from matching the tail of a
    # word that ends a sentence ("San Francisco." must stay untouched).
    text = re.sub(r'\b[cC]o\.', 'County', text)
    for old, new in (
        ('county', 'County'),
        ('A.T.', 'Arkansas'),
        ('M.T.', 'Mississippi'),
    ):
        text = text.replace(old, new)
    return text
def merge_locations(locs, text):
    """Merges all words in locs list that are spaced at most two characters
    apart in text (i.e. ", "). Assumes locs are in order in text.

    Input:
        locs - list of location strings, in the order they occur in text
        text - the string the locations were extracted from
    Output:
        list of location strings in which each run of neighbouring terms
        has been joined into one entry, including the characters that
        separate them in text. An empty locs gives an empty list.
    """
    merged = []
    last_idx = len(locs) - 1
    idx = 0
    while idx <= last_idx:
        loc = locs[idx]
        # Keep absorbing the following term while it sits right behind the
        # current one. Compare by value: an identity test ("is") on ints is
        # only reliable for small cached integers.
        while idx < last_idx:
            # gap_length also hands back a "trimmed" text in which
            # everything up to the end of the current term is blanked out,
            # so a later lookup cannot hit an earlier occurrence of a
            # repeated location word.
            gap, text, merge = gap_length(locs[idx], locs[idx + 1], text)
            if gap > 2:
                break
            loc += merge
            idx += 1
        merged.append(loc)
        idx += 1
    return merged
def gap_length(word1, word2, text):
    """Returns the number of characters after the end of word1 and
    before the start of word2 in text.

    word1 is located at its first occurrence in text; word2 is located at
    its first occurrence at or after the end of word1, so a repeated word
    or a word2 that is a substring of word1 cannot yield a negative gap.

    Input:
        word1 - the earlier term
        word2 - the term that follows word1 in text
        text  - string containing both terms
    Output:
        gap         - number of characters between the two terms
        edited_text - text with every character up to the end of word1
                      replaced by a NUL character (same length as text)
        inter_text  - the slice of text from the end of word1 through the
                      end of word2 (separator plus word2)
    Raises:
        ValueError - if word1 is not in text, or word2 does not occur
                     after word1
    """
    pos1 = text.index(word1)
    pos1_e = pos1 + len(word1)
    # Search only behind word1; searching from the start could find an
    # earlier (or overlapping) occurrence of word2.
    pos2 = text.index(word2, pos1_e)
    pos2_e = pos2 + len(word2)
    gap = pos2 - pos1_e
    # Blank out the characters already looked at with NUL placeholders so
    # that positions in the remaining text are unchanged.
    edited_text = chr(0) * pos1_e + text[pos1_e:]
    inter_text = text[pos1_e:pos2_e]
    return gap, edited_text, inter_text
def main():
    """
    Create a json file storing locations dictionary for each
    directory argument of the script.

    For every directory given on the command line, the result of
    locations_tag() is written to "<directory>.json" next to that
    directory. With no arguments nothing is done.
    """
    trailing = os.sep + (os.altsep or '')
    for directory in sys.argv[1:]:
        locations = locations_tag(directory)
        # A trailing path separator (e.g. from shell completion, "data/")
        # would otherwise produce a hidden "data/.json" inside the
        # directory. Fall back to the raw argument if stripping leaves
        # nothing (the argument was just a separator).
        base = directory.rstrip(trailing) or directory
        with open(base + '.json', 'w') as f:
            json.dump(locations, f, indent=4, separators=(',', ': '))
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment