Wikipedia glossary generator
#! /usr/bin/python
import bs4
import json
import urllib2
import html2text
import nltk
import collections
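# Third-party dependencies: beautifulsoup4 (bs4), html2text, and nltk must be
# installed separately (e.g. with pip). The punkt sentence tokenizer used below
# also needs its data files, which can be fetched with nltk.download('punkt').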
# To do: let's turn this into a full class that can be imported,
# and use argparse (with an option to serialize the glossary to a file).
# It might be nice to be able to kick this off in 3 ways (see the sketch below):
#   a full url
#   a properly formed title
#   a search term -- could use Google's site:en.wikipedia.org [term]
# (it should be able to parse the input and figure this out for you)
# Given a url, get me a definition;
# given a url, get me a list of links, for which to fetch all definitions.
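# One possible argparse wiring for the entry points above -- only a sketch,
# not yet part of the script; the term/--outfile argument names are
# hypothetical:
#
#   import argparse
#
#   parser = argparse.ArgumentParser(description='Wikipedia glossary generator')
#   parser.add_argument('term', help='a full URL, an article title, or a search term')
#   parser.add_argument('--outfile', help='serialize the glossary to this file')
#   args = parser.parse_args()
#   glossary = Glossary(args.term)
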
class Glossary():
    def __init__(self, term):
        # to do -- check term and handle the 3 cases above to create a url.
        # self.url = term
        self.title = term
        self.definitions = collections.OrderedDict()
        for title, url in self.get_links().iteritems():
            anchor_tag = "<a href='{url}'>{title}</a>".format(url=url, title=title)
            self.definitions[anchor_tag] = self.get_definition(url)
    def get_definition(self, url):
        # Wikipedia rejects the default Python user agent, so supply our own.
        request = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
        soup = bs4.BeautifulSoup(urllib2.urlopen(request))
        # First paragraph of the article body.
        article_html = str(soup.find('div', attrs={"id": "mw-content-text"}).p)
        h = html2text.HTML2Text()
        h.body_width = 0  # no line wrapping
        h.ignore_links = True
        h.ignore_emphasis = True
        h.ignore_images = True
        # h.unicode_snob = True
        article_text = h.handle(article_html.decode('utf-8'))
        # While nltk offers a more concise tag stripper, it inserts extra spaces.
        # article_text = nltk.clean_html(article_html)
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        article_sentences = tokenizer.tokenize(article_text)
        # The first sentence of the first paragraph serves as the definition.
        return article_sentences[0]
    def get_links(self, limit=3):
        # Do we want to use a page title or a page id? Not sure it actually makes sense to use an id at this point...
        url = "http://en.wikipedia.org/w/api.php?action=query&format=json&titles={title}&generator=links&prop=info&inprop=url&gpllimit={limit}".format(title=self.title, limit=limit)
        # print urllib2.urlopen(url).read()
        raw_links = json.load(urllib2.urlopen(url))['query']['pages']
        links = {}
        for pageid, values in raw_links.iteritems():
            links[values['title']] = values['fullurl']
        return links
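    # For reference, the generator=links query above returns JSON roughly of
    # this shape (field names per the MediaWiki API; values are placeholders):
    #   {"query": {"pages": {"<pageid>": {"title": "<article title>",
    #                                     "fullurl": "<article url>", ...}}}}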
    def __str__(self):
        s = ''
        for title, definition in self.definitions.iteritems():
            s += "\n{}\n\t{}".format(title, definition)
        return s
def main():
    g = Glossary('Link_farm')
    print g

if __name__ == '__main__':
    main()
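
# If this were used as an importable module (per the to-do above) rather than
# run as a script, usage might look like the following sketch; the module name
# "glossary" assumes the file is saved as glossary.py, and the call hits the
# live Wikipedia API:
#
#   from glossary import Glossary
#   g = Glossary('Link_farm')
#   print g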