Skip to content

Instantly share code, notes, and snippets.

@matthewcornell
Forked from onyxfish/example1.py
Last active September 16, 2015 14:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save matthewcornell/8f7f036b87961e9f1acd to your computer and use it in GitHub Desktop.
Save matthewcornell/8f7f036b87961e9f1acd to your computer and use it in GitHub Desktop.
Basic example of using NLTK for named-entity extraction.
# forked for my own reference
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup as bs
import nltk
def extract_named_ents_from_url(url):
    """Return the named entities NLTK finds in the page at *url*.

    The page is downloaded with ``get_bytes_for_url``, parsed with
    BeautifulSoup (``lxml`` parser), and its text is split into sentences,
    tokenized into words, POS-tagged and finally chunked with
    ``nltk.ne_chunk_sents(..., binary=True)``.

    Args:
        url: Address of the page to download and analyse.

    Returns:
        A list of entity-name strings collected from every sentence, in the
        order they are encountered. Duplicates are kept.
    """
    url_bytes = get_bytes_for_url(url)
    soup = bs(url_bytes, "lxml")
    sentences = nltk.sent_tokenize(soup.text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    # binary=True is what makes the 'NE' label check in entity_names_for_tree
    # meaningful: entities are tagged simply as 'NE' rather than by category.
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    named_entities = []
    for chunked_sentence in chunked_sentences:
        # Accumulate across all sentences; returning from inside the loop
        # would yield only the first sentence's entities.
        named_entities.extend(entity_names_for_tree(chunked_sentence))
    return named_entities
def get_bytes_for_url(url):
    """Download *url* and return the raw, undecoded response body.

    The request carries a desktop-browser ``User-Agent`` header instead of
    urllib's default one (presumably because some servers reject the default
    agent -- not verifiable from this file).

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The response body as ``bytes``. No character decoding is performed.

    Raises:
        Whatever ``urllib.request.urlopen`` raises (e.g.
        ``urllib.error.URLError``); errors are not caught here.
    """
    user_agent = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/35.0.1916.47 Safari/537.36'
    )
    req = urllib.request.Request(url, headers={'User-Agent': user_agent})
    # The context manager closes the response (and its connection) even if
    # read() raises; previously the response was never closed.
    with urllib.request.urlopen(req) as http_response:
        return http_response.read()  # todo encoding?
def entity_names_for_tree(t):
    """Recursively collect the names of all 'NE' subtrees under *t*.

    Args:
        t: A tree node exposing a ``label()`` method and iterable children
            (as produced by NLTK's chunker), or a leaf. Anything without a
            callable ``label`` is treated as a leaf.

    Returns:
        A list of strings, one per 'NE' node found, in traversal order. Each
        string is the node's children's first elements joined by single
        spaces. Returns an empty list for leaves.
    """
    # A bound method is always truthy, so testing ``t.label`` for truthiness
    # filtered nothing; check that it is callable instead.
    if not callable(getattr(t, 'label', None)):
        return []

    if t.label() == 'NE':
        # Children of an NE node are indexed at [0] for the word itself.
        return [' '.join(child[0] for child in t)]

    # Not an entity itself: descend, since entities may be nested deeper.
    entity_names = []
    for child in t:
        entity_names.extend(entity_names_for_tree(child))
    return entity_names
# Example run: executes at module level, so it fetches the page over the
# network and prints the extracted entity names as a Python list.
named_entities = extract_named_ents_from_url('http://www.omdurman.org/columns/obama12.html')
print(named_entities)
# Sample output recorded by the author (presumably from an earlier run; the
# live page and NLTK models may produce different results today):
# ['Widespread Consensus', 'Obama Promotes', 'Racism', 'Hatred', 'Israel', 'Western Civilization Home', 'Obama', 'Racism', 'Hatred', 'Israel', 'Richard Cohen', 'Brigitte Gabriel', 'Sharon Hughes', 'Charles Krauthammer', 'Kenneth Blackwell', 'Naomi Ragen', 'Debbie Schlussel', 'Ed Lasky', 'William Levinson', 'Barack', 'Catholics', 'Israel', 'United States']
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment