Last active
August 29, 2015 13:57
-
-
Save zed/9903390 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""Lookup word definitions on http://www.urbandictionary.com | |

Print the result as a JSON list.

Example:

  $ python -murbandictionary Honorificabilitudinitatibus

Output:

  [
    {
      "word": "honorificabilitudinitatibus",
      "example": "\nFor thou art not so long by the head as honorificabilitudinitatibus: thou art easier swallowed than a flap-dragon.\n",
      "def": "\nhonorableness\n\nUsed by Shakespeare in Loves Labors Lost (Costartd; Act V Scene I)\n"
    }
  ]

Based on https://github.com/novel/py-urbandict/blob/e2c7a2cccdd0067573429965d1d955bad0382cb3/urbandict.py
""" | |
from itertools import count | |
try: | |
from html.parser import HTMLParser | |
from urllib.request import urlopen, Request | |
from urllib.parse import quote as urlquote | |
except ImportError: # Python 2 | |
from HTMLParser import HTMLParser | |
from urllib2 import urlopen, Request | |
from urllib import quote as urlquote | |
# Public API: the names exported by `from <module> import *`.
__all__ = ["UrbanDictParser", "lookup"]
# Module version; lookup() appends it to the User-Agent header it sends.
__version__ = '0.1.2'
class UrbanDictParser(HTMLParser):
    """Extract word definitions from html.

    The parser looks for ``<div>`` elements whose ``class`` attribute is
    exactly ``word``, ``meaning`` or ``example`` and accumulates the text
    found inside them into dicts with the keys ``'word'``, ``'def'`` and
    ``'example'``.  Every ``word`` div starts a new dict.  The dicts are
    collected in ``self.translations`` and returned by ``close()``.

    The constructor accepts an optional ``data`` keyword argument (html
    text) that is fed to the parser immediately; all other arguments are
    passed through to ``HTMLParser``.
    """

    # html class of a known section -> key used in the result dicts
    # ('def' rather than 'meaning' for compatibility with urbandict.py)
    _SECTION_KEYS = {'word': 'word', 'meaning': 'def', 'example': 'example'}

    def __init__(self, *args, **kwargs):
        data = kwargs.pop('data', None)
        # Explicit base-class call instead of super(): HTMLParser is an
        # old-style class on Python 2, which this module still supports.
        HTMLParser.__init__(self, *args, **kwargs)
        self.translations = []  # for compatibility with urbandict.py
        self._section = None  # result key of the section being read, if any
        self._depth = 0  # number of <div> currently open within the section
        if data is not None:
            self.feed(data)

    def handle_starttag(self, tag, attrs):
        """Enter a known section or track a nested ``<div>`` inside one."""
        if tag != 'div':
            return
        klass = dict(attrs).get('class')
        if klass in self._SECTION_KEYS:
            self._section = self._SECTION_KEYS[klass]
            self._depth = 1
            if klass == 'word':  # NOTE: assume 'word' is the first section
                self.translations.append(
                    {'word': '', 'def': '', 'example': ''})
        elif self._section is not None:
            # an unrelated <div> nested in a known section: remember it so
            # that its </div> does not end the section prematurely
            self._depth += 1

    def handle_endtag(self, tag):
        """Leave the current section once its own ``</div>`` is reached."""
        if tag == 'div' and self._section is not None:
            self._depth -= 1
            if self._depth <= 0:
                self._section = None
                self._depth = 0

    def handle_data(self, data):
        """Append text found inside a known section to the latest entry."""
        if self._section is None or not self.translations:
            # Either outside the known sections, or a 'meaning'/'example'
            # section showed up before any 'word' section: there is no
            # entry to attach the text to, so it is ignored.
            return
        if self._section == 'word':
            data = data.strip()  # for compatibility with urbandict.py
        self.translations[-1][self._section] += normalize_newlines(data)

    def close(self):
        """Finish parsing and return the list of collected definitions."""
        HTMLParser.close(self)
        return self.translations
def normalize_newlines(text):
    r"""Return *text* with '\r\n' and bare '\r' line endings turned into '\n'."""
    without_crlf = text.replace('\r\n', '\n')
    # any '\r' still present is a bare carriage return
    return '\n'.join(without_crlf.split('\r'))
def lookup(word, maxpages=None):
    """Look up word definitions on urbandictionary.com.

    Result pages are downloaded one after another, starting with page 1,
    until *maxpages* pages were requested, a request fails with an
    EnvironmentError, or the response url no longer contains 'page'
    (treated as a non-existing page).

    word -- the term to look up; it is url-quoted before use
    maxpages -- non-zero value specifies how many pages to download
                (default: unlimited)

    Return the list of definition dicts produced by UrbanDictParser for
    all downloaded pages, in page order.
    """
    url = 'http://www.urbandictionary.com/define.php?term={}&page='.format(
        urlquote(word))
    headers = {'User-Agent': 'urbandictionary.py/' + __version__}
    result = []
    for page in range(1, maxpages + 1) if maxpages else count(1):
        try:
            r = urlopen(Request(url + str(page), headers=headers))
        except EnvironmentError:
            break  # request failed: return what was collected so far
        try:
            if 'page' not in r.url:  # non-existing page
                break
            html = r.read().decode('utf-8')
        finally:
            # Always release the connection; try/finally (rather than
            # `with`) keeps this working on both Python 2 and Python 3.
            r.close()
        result.extend(UrbanDictParser(data=html).close())
    return result
if __name__ == "__main__":
    # Command-line entry point: look up the word given as the first
    # argument and print its definitions to stdout as a json list.
    import json
    import sys

    if len(sys.argv) < 2:
        # sys.exit() with a string prints it to stderr and exits with
        # status 1, instead of failing with an IndexError traceback.
        sys.exit('usage: {} WORD'.format(sys.argv[0]))
    definitions = lookup(sys.argv[1])
    json.dump(definitions, sys.stdout, indent=2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment