zed/urbandictionary.py

## urbandictionary.py
#!/usr/bin/env python
"""Lookup word definitions on http://www.urbandictionary.com

Print the result as a JSON list.

Example:

    $ python -murbandictionary Honorificabilitudinitatibus

Output:

    [
      {
        "word": "honorificabilitudinitatibus",
        "example": "\nFor thou art not so long by the head as honorificabilitudinitatibus: thou art easier swallowed than a flap-dragon.\n",
        "def": "\nhonorableness\n\nUsed by Shakespeare in Loves Labors Lost (Costartd; Act V Scene I)\n"
      }
    ]

Based on https://github.com/novel/py-urbandict/blob/e2c7a2cccdd0067573429965d1d955bad0382cb3/urbandict.py
"""
from itertools import count

try:
    from html.parser import HTMLParser
    from urllib.request import urlopen, Request
    from urllib.parse import quote as urlquote
except ImportError: # Python 2
    from HTMLParser import HTMLParser
    from urllib2 import urlopen, Request
    from urllib import quote as urlquote

# Version string; lookup() appends it to the User-Agent header it sends.
__version__ = '0.1.2'

# Names exported by ``from urbandictionary import *``.
__all__ = [
    "UrbanDictParser",
    "lookup",
]

class UrbanDictParser(HTMLParser):
    """Extract word definitions from html.

    The parser watches for ``<div>`` elements whose ``class`` attribute
    is exactly ``word``, ``meaning`` or ``example``.  Every ``word`` div
    starts a new entry in ``self.translations``; text found inside the
    three kinds of div is accumulated in the latest entry under the keys
    ``'word'``, ``'def'`` and ``'example'`` respectively.

    If the keyword argument *data* is given, it is fed to the parser
    right away; all other arguments are passed on to ``HTMLParser``.
    """

    # div class -> key of the entry that collects the text of that div
    # ('meaning' is stored as 'def' for compatibility with urbandict.py)
    _SECTION_KEYS = {'word': 'word', 'meaning': 'def', 'example': 'example'}

    def __init__(self, *args, **kwargs):
        data = kwargs.pop('data', None)
        HTMLParser.__init__(self, *args, **kwargs)
        self.translations = [] # for compatibility with urbandict.py
        self._section = None # entry key being filled, None outside sections
        if data is not None:
            self.feed(data)

    def handle_starttag(self, tag, attrs):
        """Enter a section when a known ``<div class=...>`` opens."""
        if tag != 'div':
            return
        klass = dict(attrs).get('class')
        if klass not in self._SECTION_KEYS:
            return
        self._section = self._SECTION_KEYS[klass]
        if klass == 'word': #NOTE: assume 'word' is the first section
            self.translations.append(
                {'word': '', 'def': '', 'example': ''})

    def handle_endtag(self, tag):
        """Leave the current section when any ``</div>`` is seen."""
        if tag == 'div':
            #NOTE: assume there is no nested <div> in the known sections
            self._section = None

    def handle_data(self, data):
        """Append text found inside a known section to the latest entry."""
        if not self._section:
            return
        if not self.translations:
            # Section text seen before any 'word' div: there is no entry
            # to attach it to, so skip it instead of raising IndexError.
            return
        if self._section == 'word':
            data = data.strip() # for compatibility with urbandict.py
        self.translations[-1][self._section] += normalize_newlines(data)

    def close(self):
        """Finish parsing and return the list of collected entries."""
        HTMLParser.close(self)
        return self.translations

def normalize_newlines(text):
    """Return *text* with CRLF and lone CR line endings turned into LF."""
    # Collapse CRLF pairs first so that their CR is not converted again.
    without_crlf = text.replace('\r\n', '\n')
    return '\n'.join(without_crlf.split('\r'))

def lookup(word, maxpages=None):
    """Lookup word definitions on urbandictionary.com

    Pages are requested one after another, starting with page 1, until
    *maxpages* pages have been fetched, a request fails, or the final
    URL of a response no longer carries the page parameter (treated as
    a non-existing page).

    word     -- the term to look up; it is percent-quoted for the URL.
    maxpages -- Non-zero maxpages specifies how many pages to download
                (default: unlimited).

    Return the list of entries collected by UrbanDictParser from all
    downloaded pages.
    """
    url = 'http://www.urbandictionary.com/define.php?term={}&page='.format(
        urlquote(word))
    headers = {'User-Agent': 'urbandictionary.py/' + __version__}
    result = []
    for page in range(1, maxpages + 1) if maxpages else count(1):
        try:
            r = urlopen(Request(url + str(page), headers=headers))
        except EnvironmentError:
            break # request failed: return what was collected so far
        try:
            # Test for the 'page=' parameter, not the bare substring
            # 'page': a term such as "homepage" would otherwise make
            # every response look like an existing page.
            if 'page=' not in r.url: # non-existing page
                break
            html = r.read().decode('utf-8')
        finally:
            r.close() # release the connection even when stopping early
        result.extend(UrbanDictParser(data=html).close())
    return result

if __name__ == "__main__":
    # Command-line entry point: look up the word given as the first
    # argument and print its definitions to stdout as a JSON list.
    import json
    import sys
    if len(sys.argv) < 2:
        # No word given: show a usage hint instead of an IndexError.
        sys.exit("usage: python -murbandictionary WORD")
    definitions = lookup(sys.argv[1])
    json.dump(definitions, sys.stdout, indent=2)
	#!/usr/bin/env python
	"""Lookup word definitions on http://www.urbandictionary.com

	Print the result as a JSON list.

	Example:

	$ python -murbandictionary Honorificabilitudinitatibus

	Output:

	[
	{
	"word": "honorificabilitudinitatibus",
	"example": "\nFor thou art not so long by the head as honorificabilitudinitatibus: thou art easier swallowed than a flap-dragon.\n",
	"def": "\nhonorableness\n\nUsed by Shakespeare in Loves Labors Lost (Costartd; Act V Scene I)\n"
	}
	]

	Based on https://github.com/novel/py-urbandict/blob/e2c7a2cccdd0067573429965d1d955bad0382cb3/urbandict.py
	"""
	from itertools import count

	try:
	from html.parser import HTMLParser
	from urllib.request import urlopen, Request
	from urllib.parse import quote as urlquote
	except ImportError: # Python 2
	from HTMLParser import HTMLParser
	from urllib2 import urlopen, Request
	from urllib import quote as urlquote

	# Version string; lookup() appends it to the User-Agent header it sends.
	__version__ = '0.1.2'

	# Names exported by ``from urbandictionary import *``.
	__all__ = [
	    "UrbanDictParser",
	    "lookup",
	]

	class UrbanDictParser(HTMLParser):
	    """Extract word definitions from html.

	    The parser watches for ``<div>`` elements whose ``class`` attribute
	    is exactly ``word``, ``meaning`` or ``example``.  Every ``word`` div
	    starts a new entry in ``self.translations``; text found inside the
	    three kinds of div is accumulated in the latest entry under the keys
	    ``'word'``, ``'def'`` and ``'example'`` respectively.

	    If the keyword argument *data* is given, it is fed to the parser
	    right away; all other arguments are passed on to ``HTMLParser``.
	    """

	    # div class -> key of the entry that collects the text of that div
	    # ('meaning' is stored as 'def' for compatibility with urbandict.py)
	    _SECTION_KEYS = {'word': 'word', 'meaning': 'def', 'example': 'example'}

	    def __init__(self, *args, **kwargs):
	        data = kwargs.pop('data', None)
	        HTMLParser.__init__(self, *args, **kwargs)
	        self.translations = [] # for compatibility with urbandict.py
	        self._section = None # entry key being filled, None outside sections
	        if data is not None:
	            self.feed(data)

	    def handle_starttag(self, tag, attrs):
	        """Enter a section when a known ``<div class=...>`` opens."""
	        if tag != 'div':
	            return
	        klass = dict(attrs).get('class')
	        if klass not in self._SECTION_KEYS:
	            return
	        self._section = self._SECTION_KEYS[klass]
	        if klass == 'word': #NOTE: assume 'word' is the first section
	            self.translations.append(
	                {'word': '', 'def': '', 'example': ''})

	    def handle_endtag(self, tag):
	        """Leave the current section when any ``</div>`` is seen."""
	        if tag == 'div':
	            #NOTE: assume there is no nested <div> in the known sections
	            self._section = None

	    def handle_data(self, data):
	        """Append text found inside a known section to the latest entry."""
	        if not self._section:
	            return
	        if not self.translations:
	            # Section text seen before any 'word' div: there is no entry
	            # to attach it to, so skip it instead of raising IndexError.
	            return
	        if self._section == 'word':
	            data = data.strip() # for compatibility with urbandict.py
	        self.translations[-1][self._section] += normalize_newlines(data)

	    def close(self):
	        """Finish parsing and return the list of collected entries."""
	        HTMLParser.close(self)
	        return self.translations

	def normalize_newlines(text):
	    """Return *text* with CRLF and lone CR line endings turned into LF."""
	    # CRLF is replaced first so that its CR is not converted a second time.
	    return text.replace('\r\n', '\n').replace('\r', '\n')

	def lookup(word, maxpages=None):
	    """Lookup word definitions on urbandictionary.com

	    Pages are requested one after another, starting with page 1, until
	    *maxpages* pages have been fetched, a request fails, or the final
	    URL of a response no longer carries the page parameter (treated as
	    a non-existing page).

	    word     -- the term to look up; it is percent-quoted for the URL.
	    maxpages -- Non-zero maxpages specifies how many pages to download
	                (default: unlimited).

	    Return the list of entries collected by UrbanDictParser from all
	    downloaded pages.
	    """
	    url = 'http://www.urbandictionary.com/define.php?term={}&page='.format(
	        urlquote(word))
	    headers = {'User-Agent': 'urbandictionary.py/' + __version__}
	    result = []
	    for page in range(1, maxpages + 1) if maxpages else count(1):
	        try:
	            r = urlopen(Request(url + str(page), headers=headers))
	        except EnvironmentError:
	            break # request failed: return what was collected so far
	        try:
	            # Test for the 'page=' parameter, not the bare substring
	            # 'page': a term such as "homepage" would otherwise make
	            # every response look like an existing page.
	            if 'page=' not in r.url: # non-existing page
	                break
	            html = r.read().decode('utf-8')
	        finally:
	            r.close() # release the connection even when stopping early
	        result.extend(UrbanDictParser(data=html).close())
	    return result

	if __name__ == "__main__":
	    # Command-line entry point: look up the word given as the first
	    # argument and print its definitions to stdout as a JSON list.
	    import json
	    import sys
	    if len(sys.argv) < 2:
	        # No word given: show a usage hint instead of an IndexError.
	        sys.exit("usage: python -murbandictionary WORD")
	    definitions = lookup(sys.argv[1])
	    json.dump(definitions, sys.stdout, indent=2)