Created
October 2, 2010 08:28
Revisions
-
telendt renamed this gist
Mar 4, 2012 . 1 changed file with 0 additions and 0 deletions.There are no files selected for viewing
File renamed without changes. -
telendt revised this gist
Oct 2, 2010 . 1 changed file with 2 additions and 2 deletions.There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -50,8 +50,8 @@ def html_entity_decode(s, encoding='ascii'): ... Unknown entity test: >>> html_entity_decode('&unknown;') '&unknown;' >>> html_entity_decode('&#99999;') '&#99999;' -
telendt created this gist
Oct 2, 2010 .There are no files selected for viewing
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Decode HTML character references (a port of PHP's html_entity_decode)."""

import re

try:  # Python 2
    import htmlentitydefs
except ImportError:  # Python 3 moved the module
    from html import entities as htmlentitydefs

try:  # Python 2
    _text_type, _unichr = unicode, unichr
except NameError:  # Python 3: str is the text type and chr covers all of Unicode
    _text_type, _unichr = str, chr

# Group 1: the reference body (entity name, or '#' followed by a number).
# Group 2: the numeric part (decimal digits, or 'x'/'X' plus hex digits).
# Group 3: the hexadecimal digits alone.
# Seven decimal / six hex digits are just wide enough for U+10FFFF (1114111);
# values above that still match but are rejected later with ValueError.
# [0-9] is spelled out so that only ASCII digits can match.
_ENTITY_PATTERN = r'&(%s|#([0-9]{1,7}|[xX]([0-9a-fA-F]{1,6})));' % '|'.join(
    sorted(htmlentitydefs.name2codepoint))

entity_re = re.compile(_ENTITY_PATTERN)
# Python 3 refuses to apply a text pattern to bytes, so keep a bytes twin.
# On Python 2 this is the very same pattern.
_bytes_entity_re = re.compile(_ENTITY_PATTERN.encode('ascii'))


def html_entity_decode(s, encoding='ascii'):
    r"""
    Convert all HTML entities to their applicable characters.

    Python implementation of http://pl.php.net/html_entity_decode

    Named entities listed in ``htmlentitydefs.name2codepoint`` and numeric
    references (decimal ``&#NNN;`` or hexadecimal ``&#xHHH;``), each
    terminated by ``;``, are replaced.  Everything else -- unknown names, or
    numbers that are not a valid code point for this interpreter -- is left
    exactly as written.

    :param s: text (unicode) string or byte string to decode.
    :param encoding: used for byte-string input only: every decoded
        character is encoded with it, and a character the encoding cannot
        represent is written back as a decimal character reference
        (``xmlcharrefreplace``).  Bytes outside of entities are not touched.
    :returns: a string of the same type as ``s``.
    :raises TypeError: if ``s`` is neither a text string nor a byte string.

    Unicode object test:
    >>> html_entity_decode(u'test: &Theta;') == u'test: \u0398'
    True

    UTF-8 string test:
    >>> html_entity_decode(b'test: &Theta;', encoding='utf-8') == b'test: \xce\x98'
    True

    Default (ascii) string test:
    >>> html_entity_decode(b'test: &Theta;') == b'test: &#920;'
    True

    Test equal entities:
    >>> [html_entity_decode(ref) == u'>' for ref in (u'&gt;', u'&#62;', u'&#x3e;')]
    [True, True, True]
    >>> [html_entity_decode(ref) == b'>' for ref in (b'&gt;', b'&#62;', b'&#x3e;')]
    [True, True, True]

    Testing all numeric entities (decimal and hexadecimal) for unicode:
    >>> all(html_entity_decode(u'&#%d;' % i) == _unichr(i) ==
    ...     html_entity_decode(u'&#x%x;' % i) for i in range(0x10000))
    True

    Testing all named (X)HTML entities for unicode:
    >>> all(html_entity_decode(u'&%s;' % name) == _unichr(value)
    ...     for name, value in htmlentitydefs.name2codepoint.items())
    True

    Unknown entity test:
    >>> html_entity_decode(u'&unknown;') == u'&unknown;'
    True
    >>> html_entity_decode(b'&#99999;') == b'&#99999;'
    True
    >>> html_entity_decode(u'&#x110000;') == u'&#x110000;'
    True

    Wrong argument test:
    >>> html_entity_decode([])
    Traceback (most recent call last):
        ...
    TypeError: argument 1: expected string, list found
    """
    if isinstance(s, _text_type):
        pattern, want_text = entity_re, True
    elif isinstance(s, bytes):  # on Python 2 ``bytes`` is plain ``str``
        pattern, want_text = _bytes_entity_re, False
    else:
        raise TypeError('argument 1: expected string, %s found'
                        % s.__class__.__name__)

    def replace_entity(match):
        """Return the character for one match, or the match text unchanged."""
        name, number, hex_digits = match.groups()
        try:
            if hex_digits is not None:
                codepoint = int(hex_digits, 16)
            elif number is not None:
                codepoint = int(number)
            else:
                if not isinstance(name, (str, _text_type)):
                    # Python 3 bytes match: the lookup table is keyed by text.
                    name = name.decode('ascii')
                codepoint = htmlentitydefs.name2codepoint[name]
            # Raises ValueError for a code point this interpreter cannot
            # represent (above U+10FFFF, or above U+FFFF on a narrow
            # Python 2 build).
            char = _unichr(codepoint)
            if want_text:
                return char
            return char.encode(encoding, 'xmlcharrefreplace')
        except ValueError:
            # Best effort by design: an undecodable reference stays as-is.
            return match.group(0)

    return pattern.sub(replace_entity, s)


def _test():
    """Run the doctests embedded in this module."""
    import doctest
    doctest.testmod()


if __name__ == "__main__":
    _test()