Create a gist now

Instantly share code, notes, and snippets.

Unescaping character entity references and numeric character references with Python 2.x
import re
import sys

try:
    from htmlentitydefs import name2codepoint  # Python 2
except ImportError:
    from html.entities import name2codepoint  # Python 3
# Matches "&name;", "&#123;" and "&#x7B;" style references (case-insensitive).
# Entity names may contain digits after the first letter ("sup2", "frac12",
# "there4", ...), so the name part must not be limited to letters only.
_CHARREF_RE = re.compile(
    r'&((?P<char>[a-z][a-z0-9]*)|#(?P<decimal>\d+)|#x(?P<hex>[\da-f]+));',
    re.IGNORECASE
)

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # no ``unichr`` builtin: ``chr`` already returns text


def _uchr(c):
    """Return the text for code point *c*, or '' if it cannot be represented."""
    if 0 < c <= sys.maxunicode:
        # Directly representable: BMP on a UCS-2 build, everything on UCS-4.
        return _unichr(c)
    if sys.maxunicode < c <= 0x10ffff:
        # Supplementary plane on a UCS-2 build: emit a surrogate pair.
        c -= 0x10000
        return _unichr((c >> 10) | 0xD800) + _unichr((c & 0x3FF) | 0xDC00)
    # c == 0 or c is outside the Unicode range.
    return ''


def _replace_charref(matchobj):
    """Return the replacement text for one reference matched by _CHARREF_RE."""
    name = matchobj.group('char')
    decimal = matchobj.group('decimal')
    hexadecimal = matchobj.group('hex')
    if name:
        # Character entity reference.
        if name == 'apos':
            # 'apos' is not in name2codepoint.
            num = ord("'")
        else:
            # The lookup is case-sensitive; unknown names map to 0.
            num = name2codepoint.get(name, 0)
    elif decimal:
        # Numeric character reference (decimal).
        num = int(decimal)
    elif hexadecimal:
        # Numeric character reference (hexadecimal).
        num = int(hexadecimal, 16)
    else:
        num = 0
    # An empty result means "not convertible": keep the reference verbatim.
    return _uchr(num) or matchobj.group()


def unescape_charref(escaped):
    """Replace character references in *escaped* with the characters they name.

    Handles named entity references (``&amp;``, ``&sup2;``, plus ``&apos;``),
    decimal references (``&#65;``) and hexadecimal references (``&#x41;``).
    References that cannot be resolved -- unknown names, code point 0, or
    values above U+10FFFF -- are left in the text unchanged.

    Args:
        escaped: Text that may contain character references.

    Returns:
        The text with every resolvable reference replaced.
    """
    return _CHARREF_RE.sub(_replace_charref, escaped)
# Command-line entry point: unescape the text given as arguments and print
# the repr() of the result.
if __name__ == '__main__':
    args = sys.argv[1:]
    if not args:
        print('usage: python unescape_chardef.py *escaped text*')
    else:
        text = ' '.join(args)
        if isinstance(text, bytes):
            # Arguments arrived as byte strings (Python 2) and must be decoded.
            # sys.stdin.encoding is None when stdin is not a terminal (e.g. a
            # pipe), and decode(None) raises TypeError, so fall back to the
            # filesystem encoding and finally to UTF-8.
            encoding = (getattr(sys.stdin, 'encoding', None)
                        or sys.getfilesystemencoding()
                        or 'utf-8')
            text = text.decode(encoding)
        print(repr(unescape_charref(text)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment