Created
April 9, 2019 11:10
-
-
Save orico/a77ccddc8e74425441c57f944b17c809 to your computer and use it in GitHub Desktop.
Stanford NER Location old API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
#nltk v1 | |
from nltk.tag.stanford import StanfordNERTagger | |
from nltk.tokenize import word_tokenize | |
# Directory holding the input CSV and the Stanford NER model/jar files.
path_to_data = './data/'

# Table of US cities; only the 'City alias' column is used by this script.
cities = pd.read_csv(path_to_data + 'us_cities_states_counties.csv')

# Force every alias to str before tokenizing.
# NOTE(review): presumably guards against missing aliases being read as
# float NaN (which becomes the string 'nan' here) -- confirm against the CSV.
cities['City alias'] = cities['City alias'].apply(str)
def formatted_entities(classified_paragraphs_list, entity_type='LOCATION'):
    """Collect the tokens tagged as *entity_type* from tagged sentences.

    Args:
        classified_paragraphs_list: Iterable of tagged sentences. Each
            sentence is an iterable of entries where ``entry[0]`` is the
            token and ``entry[1]`` is its tag.
        entity_type: Tag to keep. Defaults to ``'LOCATION'``, which is the
            only tag the original implementation collected.

    Returns:
        A flat list of the matching tokens, in the order they were seen
        across all sentences. Empty when nothing matches.
    """
    entities = []
    for classified_paragraph in classified_paragraphs_list:
        # Index access (rather than tuple unpacking) keeps entries with
        # extra trailing fields working, exactly as before.
        entities.extend(
            entry[0]
            for entry in classified_paragraph
            if entry[1] == entity_type
        )
    return entities
# Stanford NER tagger via NLTK's old (jar-based) API: the first argument is
# the serialized CRF model, the second the Stanford NER jar. Both files are
# expected under `path_to_data`.
tagger = StanfordNERTagger(
    path_to_data + 'english.all.3class.distsim.crf.ser.gz',
    path_to_data + 'stanford-ner.jar',
    encoding='utf-8',
)
# Measure how many city aliases the Stanford tagger recognizes as a location.
count = 0   # aliases for which at least one token was tagged LOCATION
passed = 0  # aliases whose processing raised an exception
for i, city in enumerate(cities['City alias'].values):
    # Reset per-iteration state so the progress line below never hits an
    # undefined name (failure on the very first alias) or reports stale
    # values left over from the previous alias after a failure.
    city_ = []
    formatted_result = []
    try:
        city_ = word_tokenize(city)
        # NOTE(review): tagging one alias per call is likely slow with the
        # jar-based tagger; batching would be faster but would lose the
        # per-alias error isolation this loop relies on.
        classified_paragraphs_list = tagger.tag_sents([city_])
        formatted_result = formatted_entities(classified_paragraphs_list)
        if formatted_result:
            count += 1
    except Exception as e:
        # Deliberately best-effort: record the failure and keep going so a
        # single bad alias does not abort the whole run.
        passed += 1
        print(i, city, 'error:', e)
    # Progress report every 100 aliases.
    if i % 100 == 0:
        print(i, count, passed, city, city_, 'result:', ' '.join(formatted_result))
print(f'Stanford knows {count} out of {cities.shape[0]}')
print('couldnt process:', passed)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment