Wikipedia glossary generator
#! /usr/bin/python
import bs4
import json
import urllib2
import html2text
import nltk
import collections
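# Third-party dependencies: beautifulsoup4 (bs4), html2text, and nltk must be
# installed separately (e.g. with pip). The punkt sentence tokenizer used below
# also needs its data files, which can be fetched with nltk.download('punkt').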
# To do: let's turn this into a full class that can be imported,
# and use argparse (with an option to serialize the glossary to a file).
# It might be nice to be able to kick this off in 3 ways (see the sketch below):
#   a full url
#   a properly formed title
#   a search term -- could use Google's site:en.wikipedia.org [term]
# (it should be able to parse the input and figure this out for you)
# Given a url, get me a definition;
# given a url, get me a list of links, for which to fetch all definitions.
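# One possible argparse wiring for the entry points above -- only a sketch,
# not yet part of the script; the term/--outfile argument names are
# hypothetical:
#
#   import argparse
#
#   parser = argparse.ArgumentParser(description='Wikipedia glossary generator')
#   parser.add_argument('term', help='a full URL, an article title, or a search term')
#   parser.add_argument('--outfile', help='serialize the glossary to this file')
#   args = parser.parse_args()
#   glossary = Glossary(args.term)
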
class Glossary():
    def __init__(self, term):
        # to do -- check term and handle the 3 cases above to create a url.
        # self.url = term
        self.title = term
        self.definitions = collections.OrderedDict()
        for title, url in self.get_links().iteritems():
            anchor_tag = "<a href='{url}'>{title}</a>".format(url=url, title=title)
            self.definitions[anchor_tag] = self.get_definition(url)
    def get_definition(self, url):
        # Wikipedia rejects the default Python user agent, so supply our own.
        request = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
        soup = bs4.BeautifulSoup(urllib2.urlopen(request))
        # First paragraph of the article body.
        article_html = str(soup.find('div', attrs={"id": "mw-content-text"}).p)
        h = html2text.HTML2Text()
        h.body_width = 0  # no line wrapping
        h.ignore_links = True
        h.ignore_emphasis = True
        h.ignore_images = True
        # h.unicode_snob = True
        article_text = h.handle(article_html.decode('utf-8'))
        # While nltk offers a more concise tag stripper, it inserts extra spaces.
        # article_text = nltk.clean_html(article_html)
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        article_sentences = tokenizer.tokenize(article_text)
        # The first sentence of the first paragraph serves as the definition.
        return article_sentences[0]
    def get_links(self, limit=3):
        # Do we want to use a page title or a page id? Not sure it actually makes sense to use an id at this point...
        url = "http://en.wikipedia.org/w/api.php?action=query&format=json&titles={title}&generator=links&prop=info&inprop=url&gpllimit={limit}".format(title=self.title, limit=limit)
        # print urllib2.urlopen(url).read()
        raw_links = json.load(urllib2.urlopen(url))['query']['pages']
        links = {}
        for pageid, values in raw_links.iteritems():
            links[values['title']] = values['fullurl']
        return links
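    # For reference, the generator=links query above returns JSON roughly of
    # this shape (field names per the MediaWiki API; values are placeholders):
    #   {"query": {"pages": {"<pageid>": {"title": "<article title>",
    #                                     "fullurl": "<article url>", ...}}}}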
    def __str__(self):
        s = ''
        for title, definition in self.definitions.iteritems():
            s += "\n{}\n\t{}".format(title, definition)
        return s
def main():
    g = Glossary('Link_farm')
    print g

if __name__ == '__main__':
    main()
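
# If this were used as an importable module (per the to-do above) rather than
# run as a script, usage might look like the following sketch; the module name
# "glossary" assumes the file is saved as glossary.py, and the call hits the
# live Wikipedia API:
#
#   from glossary import Glossary
#   g = Glossary('Link_farm')
#   print g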