Skip to content

Instantly share code, notes, and snippets.

@telendt
Created October 2, 2010 08:28
Show Gist options
  • Save telendt/607454 to your computer and use it in GitHub Desktop.
Save telendt/607454 to your computer and use it in GitHub Desktop.
html_entity_decode in Python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import htmlentitydefs
import re
# Matches a single HTML character reference terminated by ';'.
#   group 1: the reference without '&' and ';' (an entity name, or '#...')
#   group 2: the numeric part after '#' (decimal digits, or 'x'/'X' + hex
#            digits); None for named entities
#   group 3: the hexadecimal digits alone; None unless the reference is hex
# Up to 7 decimal / 6 hexadecimal digits are accepted so that every code
# point through U+10FFFF (decimal 1114111) can be written, including
# supplementary-plane references such as '&#x1F600;'.  The pattern does not
# validate the numeric range; that is left to whoever converts the number.
# Names are sorted only to make the compiled pattern deterministic.
entity_re = re.compile(
    r'&(%s|#(\d{1,7}|[xX]([\da-fA-F]{1,6})));'
    % '|'.join(sorted(htmlentitydefs.name2codepoint)))
def html_entity_decode(s, encoding='ascii'):
    """
    Convert all HTML entities to their applicable characters.

    Python implementation of http://pl.php.net/html_entity_decode

    Every reference matched by the module-level ``entity_re`` pattern (named
    entities listed in ``htmlentitydefs.name2codepoint`` and decimal or
    hexadecimal numeric references) is replaced by the character it denotes.
    Text that does not match the pattern is returned unchanged.

    Arguments:
    s -- the ``str`` or ``unicode`` object to decode.
    encoding -- used only when ``s`` is a byte string: each decoded character
        is encoded with this codec, and characters the codec cannot
        represent are emitted as numeric character references
        (``xmlcharrefreplace``).

    Returns an object of the same string type as ``s``.

    Raises TypeError when ``s`` is not a string.

    Unicode object test:
    >>> html_entity_decode(u'test: &Theta;')
    u'test: \\u0398'

    UTF-8 string test:
    >>> html_entity_decode('test: &Theta;', encoding='utf-8')
    'test: \\xce\\x98'

    Default (ascii) string test:
    >>> html_entity_decode('test: &Theta;')
    'test: &#920;'

    Test equal entities:
    >>> print 62, hex(62), chr(62), unichr(62)
    62 0x3e > >
    >>> html_entity_decode(u'&gt;') == html_entity_decode('&gt;') == \
        html_entity_decode('&gt;', encoding='utf-8') == \
        html_entity_decode(u'&#62;') == html_entity_decode('&#62;') == \
        html_entity_decode('&#62;', encoding='utf-8') == \
        html_entity_decode(u'&#x3e;') == html_entity_decode('&#x3e;') == \
        html_entity_decode('&#x3e;', encoding='utf-8')
    True

    Testing all numeric entities (decimal and hexadecimal) for unicode:
    >>> for i in xrange(0x10000):
    ...     assert unichr(i) == html_entity_decode(u'&#%d;' % i) == \
                html_entity_decode(u'&#x%s;' % hex(i)[2:])
    ...

    Testing all named (X)HTML entities for unicode:
    >>> for key, value in htmlentitydefs.name2codepoint.iteritems():
    ...     assert unichr(value) == html_entity_decode(u'&%s;' % key)
    ...

    Unknown entity test:
    >>> html_entity_decode('&unknown;')
    '&unknown;'
    >>> html_entity_decode('&#99999;')
    '&#99999;'

    Wrong argument test:
    >>> html_entity_decode([])
    Traceback (most recent call last):
        ...
    TypeError: argument 1: expected string, list found
    """
    if not isinstance(s, basestring):
        raise TypeError('argument 1: expected string, %s found'
                        % s.__class__.__name__)

    # Decide once whether replacements must be encoded back to bytes.
    wants_unicode = isinstance(s, unicode)

    def replace_entity(matchobj):
        # Groups: (name or '#...', numeric part incl. optional 'x', hex digits)
        name, number, hex_digits = matchobj.groups()
        if hex_digits is not None:
            codepoint = int(hex_digits, 16)
        elif number is not None:
            codepoint = int(number)
        else:
            codepoint = htmlentitydefs.name2codepoint[name]
        try:
            char = unichr(codepoint)
            if not wants_unicode:
                char = char.encode(encoding, 'xmlcharrefreplace')
        except ValueError:
            # unichr() rejects code points outside the range this interpreter
            # build supports; such references are left exactly as written.
            return matchobj.group(0)
        return char

    return entity_re.sub(replace_entity, s)
def _test():
    """Run ``doctest.testmod()`` with its default arguments."""
    from doctest import testmod
    testmod()
# When executed as a script (rather than imported), run the doctests.
if __name__ == '__main__':
    _test()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment