Skip to content

Instantly share code, notes, and snippets.

@br0nstein
Created April 25, 2014 12:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save br0nstein/11288492 to your computer and use it in GitHub Desktop.
Save br0nstein/11288492 to your computer and use it in GitHub Desktop.
Generates a JSON file of all location names in each text file in a given directory
import io
import json
import os
import re
import sys

import ner  # pyner
def locations_tag(directory, host='localhost', port=8080):
    """
    Finds location terms in all text files in a given directory
    Input:
        directory - string representing the local directory to analyze
        host - host passed to ner.SocketNER (default 'localhost')
        port - port passed to ner.SocketNER (default 8080)
    Output:
        locations - dictionary mapping each ".txt" file name for which the
                    tagger reported 'LOCATION' entities to the list of
                    merged location terms
    """
    locations = {}
    tagger = ner.SocketNER(host=host, port=port)
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(directory, filename)
        # io.open decodes UTF-8 while reading and behaves the same on
        # Python 2 and 3; calling .decode() on the result of a text-mode
        # read fails on Python 3.
        with io.open(path, 'r', encoding='utf8') as f:
            text = preprocess(f.read())
        # The file is already closed here, so it is not held open while
        # waiting for the tagger.
        entities = tagger.get_entities(text)
        if 'LOCATION' in entities:
            locations[filename] = merge_locations(entities['LOCATION'], text)
    return locations
def preprocess(text):
    """Make predefined text replacements to aid in named entity
    recognition.

    Line breaks become sentence breaks (". "), the abbreviations "co." and
    "Co." and the word "county" become "County", and "A.T." / "M.T."
    become "Arkansas" / "Mississippi".

    Input:
        text - string to normalise
    Output:
        the string with all replacements applied
    """
    # Line breaks are replaced first, and always first, so the result does
    # not depend on dictionary iteration order: "co" at the end of a line
    # consistently becomes "co. " and is then expanded below.
    text = text.replace('\n', '. ')

    # Each pattern is anchored at a word boundary so that words which merely
    # end in the abbreviation (e.g. "Mexico.") are left untouched.
    abbreviations = (
        (r'\b[cC]o\.', 'County'),
        (r'\bcounty\b', 'County'),
        (r'\bA\.T\.', 'Arkansas'),
        (r'\bM\.T\.', 'Mississippi'),
    )
    for pattern, replacement in abbreviations:
        text = re.sub(pattern, replacement, text, flags=re.UNICODE)
    return text
def merge_locations(locs, text):
    """Merges all words in locs list that are spaced at most two characters
    apart in text (i.e. ", "). Assumes locs are in order in text.

    Input:
        locs - list of location strings, in the order they occur in text
        text - string the locations were found in
    Output:
        merged - list of location strings in which each run of neighbouring
                 locations has been joined, separator included, into one
                 entry
    """
    idx = 0
    last_idx = len(locs) - 1
    merged = []
    while idx <= last_idx:
        loc = locs[idx]
        # Compare the indices by value. An identity test ("is") on ints only
        # works for the small integers CPython happens to cache and would
        # run past the end of a long list.
        while idx < last_idx:
            # gap_length also returns the text "trimmed" through the
            # location just examined, which prevents indexing the wrong
            # occurrence of a location word that appears several times.
            gap, text, merge = gap_length(locs[idx], locs[idx + 1], text)
            if gap > 2:
                break
            loc += merge
            idx += 1
        merged.append(loc)
        idx += 1
    return merged
def gap_length(word1, word2, text):
    """Returns the number of characters after the end of word1 and
    before the start of word2 in text.

    Input:
        word1 - string whose first occurrence in text is used
        word2 - string looked up after the end of word1
        text - string to search
    Output:
        gap - number of characters between the end of word1 and the start
              of word2
        edited_text - text with everything through the end of word1
                      replaced by NUL characters (same length as text)
        inter_text - the separator between the words followed by word2,
                     i.e. what must be appended to word1 to merge them
    Raises ValueError if word1 is not in text or word2 does not occur
    after word1.
    """
    pos1 = text.index(word1)
    pos1_e = pos1 + len(word1)
    # Look for word2 only after word1. Searching from the start of the text
    # could match an occurrence inside or before word1 (e.g. "York" within
    # "New York") and yield a negative gap.
    pos2 = text.index(word2, pos1_e)
    pos2_e = pos2 + len(word2)
    gap = pos2 - pos1_e
    # Blank out the characters already looked at, keeping the positions of
    # the remaining text unchanged.
    edited_text = chr(0) * pos1_e + text[pos1_e:]
    inter_text = text[pos1_e:pos2_e]
    return gap, edited_text, inter_text
def main():
    """
    Create a json file storing locations dictionary for each
    directory argument of the script

    For every directory given on the command line, the dictionary returned
    by locations_tag is written to "<directory>.json". When no directory is
    given, a usage line is written to stderr and nothing else happens.
    """
    directories = sys.argv[1:]
    if not directories:
        sys.stderr.write('usage: %s DIRECTORY [DIRECTORY ...]\n' % sys.argv[0])
        return

    trailing = os.sep + (os.altsep or '')
    for directory in directories:
        locations = locations_tag(directory)
        # Drop trailing path separators so that "data/" produces "data.json"
        # next to the directory instead of a hidden "data/.json" inside it.
        # Fall back to the raw argument if stripping leaves nothing.
        base = directory.rstrip(trailing) or directory
        with open(base + '.json', 'w') as f:
            json.dump(locations, f, indent=4, separators=(',', ': '))
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment