Skip to content

Instantly share code, notes, and snippets.

@orico
Created April 9, 2019 11:10
Show Gist options
  • Save orico/a77ccddc8e74425441c57f944b17c809 to your computer and use it in GitHub Desktop.
Save orico/a77ccddc8e74425441c57f944b17c809 to your computer and use it in GitHub Desktop.
Stanford NER Location old API
import pandas as pd
import numpy as np
#nltk v1
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize
# Directory that holds the input CSV and the Stanford NER resources.
path_to_data = './data/'

# Load the city table and coerce every 'City alias' value to str so that the
# tokenizer further down always receives text.
cities = pd.read_csv(path_to_data + 'us_cities_states_counties.csv')
cities['City alias'] = cities['City alias'].apply(str)
def formatted_entities(classified_paragraphs_list):
    """Return the values of all entries tagged 'LOCATION'.

    Args:
        classified_paragraphs_list: An iterable of paragraphs, where each
            paragraph is an iterable of entries. Each entry is indexable,
            with the value at index 0 and its tag at index 1.

    Returns:
        A flat list of entry values whose tag equals 'LOCATION', in the
        order they were encountered. Empty if nothing matched.
    """
    entities = []
    for classified_paragraph in classified_paragraphs_list:
        # Index rather than unpack so entries longer than two items still work.
        entities.extend(
            entry[0] for entry in classified_paragraph if entry[1] == 'LOCATION'
        )
    return entities
# Build the Stanford NER tagger from the model and jar files stored under
# path_to_data.
tagger = StanfordNERTagger(
    model_filename=path_to_data + 'english.all.3class.distsim.crf.ser.gz',
    path_to_jar=path_to_data + 'stanford-ner.jar',
    encoding='utf-8',
)
# Count how many city aliases yield at least one LOCATION entity.
count = 0   # aliases for which at least one LOCATION token was returned
passed = 0  # aliases that raised while being tokenized or tagged
for i, city in enumerate(cities['City alias'].values):
    # Reset per-row state up front: the progress line below reads both names,
    # and without this a failure on the first row raises NameError, while a
    # later failure would report the previous row's tokens and result.
    city_ = []
    formatted_result = []
    try:
        city_ = word_tokenize(city)
        # NOTE(review): the tagger is invoked once per alias; batching all
        # aliases into a single tag_sents call is presumably much faster,
        # but would lose the per-row error isolation — confirm before changing.
        classified_paragraphs_list = tagger.tag_sents([city_])
        formatted_result = formatted_entities(classified_paragraphs_list)
        if formatted_result:
            count += 1
    except Exception as e:
        # Deliberately best-effort: record the failure and move on to the
        # next alias instead of aborting the whole run.
        passed += 1
        print(i, city, 'error:', e)
    if i % 100 == 0:
        print(i, count, passed, city, city_, 'result:', ' '.join(formatted_result))
print(f'Stanford knows {count} out of {cities.shape[0]}')
print('couldnt process:', passed)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment