# bwhitman/recog.py

## recog.py
import nltk
import subprocess
from bs4 import BeautifulSoup

# Default User-Agent header value handed to curl; it identifies the client
# as desktop Safari rather than as curl.
_fake_ua = (
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; de-at) "
    "AppleWebKit/533.21.1 (KHTML, like Gecko) "
    "Version/5.0.5 Safari/533.21.1"
)

def curl_something(url, user_agent = _fake_ua, referer = None, cookies_file = "cookies.txt", wait_rand = 0):
    """Fetch ``url`` with the ``curl`` command-line tool and return the body.

    Redirects are followed (``-L``) and curl's progress output is silenced
    (``-s``).  The command is given to ``subprocess`` as an argument list
    instead of a shell string, so quotes, spaces, ``$`` or backticks in any
    argument reach curl verbatim and are never interpreted by a shell.

    Args:
        url: Address to download.
        user_agent: Value for curl's ``-A`` (User-Agent) option.
        referer: Value for curl's ``--referer`` option; omitted when falsy.
        cookies_file: Value for curl's ``-b`` option; omitted when falsy.
        wait_rand: Accepted for interface compatibility; currently unused.

    Returns:
        Whatever curl wrote to standard output, undecoded.  curl's standard
        error is captured and discarded, and its exit status is not checked.

    Raises:
        OSError: If the ``curl`` executable cannot be started.
    """
    cmd = ['curl', '-L', '-s']
    if referer:
        # The long option needs two dashes; "-referer" would be read by curl
        # as "-r eferer" (a byte range) followed by a stray extra URL.
        cmd += ['--referer', referer]
    cmd += ['-A', user_agent]
    if cookies_file:
        cmd += ['-b', cookies_file]
    cmd.append(url)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    html, _errs = p.communicate()
    return html

def parse_html(url):
    """Download ``url`` and return its text content, one fragment per line.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted.  Every line of the extracted text is stripped and split on
    double spaces; the non-empty fragments are joined with newlines.
    """
    soup = BeautifulSoup(curl_something(url), 'lxml')
    # Detach script and style elements so their contents are not extracted.
    for tag in soup(["script", "style"]):
        tag.extract()
    fragments = []
    for raw_line in soup.get_text().splitlines():
        for piece in raw_line.strip().split("  "):
            piece = piece.strip()
            if piece:
                fragments.append(piece)
    return '\n'.join(fragments)

def get_entities(text):
    """Return the names NLTK's named-entity chunker labels ``PERSON``.

    The text is word-tokenized, part-of-speech tagged and chunked with
    ``nltk.ne_chunk``; each subtree labelled ``PERSON`` contributes one
    entry.

    Args:
        text: Plain text to analyse.

    Returns:
        A list of strings in tree traversal order (duplicates are kept),
        each made of the subtree's tokens joined by single spaces.
    """
    tokens = nltk.tokenize.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    # binary=False asks the chunker for specific entity labels, which is
    # what makes filtering on 'PERSON' possible.
    tree = nltk.ne_chunk(tagged, binary = False)
    # Each leaf is indexable with the token text at position 0.
    return [
        " ".join(leaf[0] for leaf in subtree.leaves())
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'PERSON')
    ]


# Example run: print the people named in one New York Times article.
url = "http://www.nytimes.com/2016/01/12/arts/music/david-bowie-dies-at-69.html"
# The call form of print gives the same output on Python 2 and Python 3.
print(get_entities(parse_html(url)))
	import nltk
	import subprocess
	from bs4 import BeautifulSoup

	# Default User-Agent header value handed to curl; it identifies the client
	# as desktop Safari rather than as curl.
	_fake_ua = (
		"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; de-at) "
		"AppleWebKit/533.21.1 (KHTML, like Gecko) "
		"Version/5.0.5 Safari/533.21.1"
	)

	def curl_something(url, user_agent = _fake_ua, referer = None, cookies_file = "cookies.txt", wait_rand = 0):
		"""Fetch ``url`` with the ``curl`` command-line tool and return the body.

		Redirects are followed (``-L``) and progress output is silenced
		(``-s``).  Arguments are passed to curl as a list, without a shell,
		so special characters in them are never shell-interpreted.

		Args:
			url: Address to download.
			user_agent: Value for curl's ``-A`` (User-Agent) option.
			referer: Value for curl's ``--referer`` option; omitted when falsy.
			cookies_file: Value for curl's ``-b`` option; omitted when falsy.
			wait_rand: Accepted for interface compatibility; currently unused.

		Returns:
			Whatever curl wrote to standard output, undecoded.  Standard
			error is captured and discarded; the exit status is not checked.

		Raises:
			OSError: If the ``curl`` executable cannot be started.
		"""
		cmd = ['curl', '-L', '-s']
		if referer:
			# Two dashes are required: "-referer" would be parsed by curl
			# as "-r eferer" (a byte range) plus a stray extra URL.
			cmd += ['--referer', referer]
		cmd += ['-A', user_agent]
		if cookies_file:
			cmd += ['-b', cookies_file]
		cmd.append(url)
		p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
		html, _errs = p.communicate()
		return html

	def parse_html(url):
		"""Download ``url`` and return its text content, one fragment per line.

		``<script>`` and ``<style>`` elements are removed before the text is
		extracted.  Every line of the extracted text is stripped and split
		on double spaces; the non-empty fragments are joined with newlines.
		"""
		html = curl_something(url)
		soup = BeautifulSoup(html, 'lxml')
		# Detach script and style elements so their contents are not extracted.
		for script in soup(["script", "style"]):
			script.extract()
		text = soup.get_text()
		lines = (line.strip() for line in text.splitlines())
		# Split on a run of two spaces, not one: a single-space delimiter
		# would put every word on its own output line.
		chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
		return '\n'.join(chunk for chunk in chunks if chunk)

	def get_entities(text):
		"""Return the names NLTK's named-entity chunker labels ``PERSON``.

		The text is word-tokenized, part-of-speech tagged and chunked with
		``nltk.ne_chunk``; each subtree labelled ``PERSON`` contributes one
		entry.

		Args:
			text: Plain text to analyse.

		Returns:
			A list of strings in tree traversal order (duplicates are kept),
			each made of the subtree's tokens joined by single spaces.
		"""
		tokens = nltk.tokenize.word_tokenize(text)
		pos = nltk.pos_tag(tokens)
		# binary=False asks the chunker for specific entity labels, which is
		# what makes filtering on 'PERSON' possible.
		sentt = nltk.ne_chunk(pos, binary = False)
		people = []
		for subtree in sentt.subtrees(filter=lambda t: t.label() == 'PERSON'):
			# Each leaf is indexable with the token text at position 0.
			people.append(" ".join(leaf[0] for leaf in subtree.leaves()))
		return people


	# Example run: print the people named in one New York Times article.
	url = "http://www.nytimes.com/2016/01/12/arts/music/david-bowie-dies-at-69.html"
	# The call form of print gives the same output on Python 2 and Python 3.
	print(get_entities(parse_html(url)))