# gist by @bwhitman, created January 13, 2016
# Fetch a news article, strip it down to plain text, and pull out PERSON entities with NLTK.
import nltk
import subprocess
from bs4 import BeautifulSoup
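# One-time setup (a sketch; package names assume NLTK 3.x defaults): the tokenizer,
# POS tagger, and NE chunker used below need these data packages downloaded first.
# Uncomment and run once per machine:
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')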
_fake_ua = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; de-at) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1"
def curl_something(url, user_agent=_fake_ua, referer=None, cookies_file="cookies.txt", wait_rand=0):
    # Fetch a URL by shelling out to curl: follow redirects (-L), stay quiet (-s),
    # spoof a browser user agent (-A), and optionally send a referer and a cookies file.
    cmd = 'curl -L -s %s -A "%s" %s "%s"' % \
        ('' if not referer else '--referer "' + referer + '"',
         user_agent,
         '' if not cookies_file else "-b " + cookies_file,
         url)
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (html, errs) = p.communicate()
    return html
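# A possible alternative fetcher using the third-party `requests` library instead of
# shelling out to curl (a sketch, not part of the original gist; assumes
# `pip install requests` and skips curl's cookies.txt handling in favor of a bare GET).
# The name requests_something is just illustrative.
def requests_something(url, user_agent=_fake_ua, referer=None):
    import requests  # imported here so the curl-based path adds no extra dependency
    headers = {"User-Agent": user_agent}
    if referer:
        headers["Referer"] = referer
    return requests.get(url, headers=headers).text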
def parse_html(url):
    # Download the page, drop <script>/<style> blocks, and collapse the remaining
    # visible text into clean, newline-separated chunks.
    html = curl_something(url)
    soup = BeautifulSoup(html, 'lxml')
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    # split on runs of double spaces so separate phrases jammed onto one line come apart
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text
def get_entities(text):
    # Tokenize and POS-tag the text, run NLTK's named-entity chunker, and collect
    # every chunk labeled PERSON as a full name string.
    people = []
    tokens = nltk.tokenize.word_tokenize(text)
    pos = nltk.pos_tag(tokens)
    sentt = nltk.ne_chunk(pos, binary=False)
    for subtree in sentt.subtrees(filter=lambda t: t.label() == 'PERSON'):
        p = ""
        for leaf in subtree.leaves():
            p = p + leaf[0] + " "
        people.append(p.rstrip())
    return people
url = "http://www.nytimes.com/2016/01/12/arts/music/david-bowie-dies-at-69.html"
print(get_entities(parse_html(url)))
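# Expected result: a list of PERSON entity strings pulled from the article text
# (for this Bowie obituary, 'David Bowie' should be among them; exact contents
# depend on the page markup served and on the NLTK models installed).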