# GNQG/unescape_charref.py

## unescape_charref.py
import re
import sys
from htmlentitydefs import name2codepoint

def unescape_charref(escaped):
    """Replace HTML character references in *escaped* with their characters.

    Three forms are recognised, each introduced by ``&`` and closed by ``;``:

    * named references such as ``&amp;`` or ``&frac12;``, resolved through
      ``name2codepoint`` (``&apos;`` is handled separately because that
      table does not contain it);
    * decimal numeric references such as ``&#38;``;
    * hexadecimal numeric references such as ``&#x26;``.

    Matching is case-insensitive, but named references are looked up with
    the case they were written in.  A reference that cannot be resolved
    (unknown name, code point 0, or a number above U+10FFFF) is left in the
    text unchanged.

    Args:
        escaped: Text that may contain character references.

    Returns:
        The text with every resolvable reference replaced.
    """
    try:
        make_char = unichr  # Python 2 spelling of "code point -> text"
    except NameError:
        make_char = chr  # no unichr builtin: chr() already returns text

    def codepoint_to_text(codepoint):
        """Return the text for *codepoint*, or '' if it is not usable."""
        if 0 < codepoint <= sys.maxunicode:
            # Directly representable on this interpreter build.
            return make_char(codepoint)
        if sys.maxunicode < codepoint <= 0x10FFFF:
            # Valid Unicode but above this build's limit: emit the
            # corresponding UTF-16 surrogate pair instead.
            offset = codepoint - 0x10000
            high = 0xD800 | (offset >> 10)
            low = 0xDC00 | (offset & 0x3FF)
            return make_char(high) + make_char(low)
        # Zero, or beyond the Unicode range.
        return ''

    def replace_reference(match):
        """Return the replacement text for one matched reference."""
        name = match.group('char')
        decimal = match.group('decimal')
        hexadecimal = match.group('hex')
        if name:
            if name == 'apos':
                # 'apos' is not in name2codepoint.
                codepoint = ord("'")
            else:
                codepoint = name2codepoint.get(name, 0)
        elif decimal:
            try:
                codepoint = int(decimal)
            except ValueError:
                # Interpreters that cap str->int conversion length reject
                # absurdly long digit runs; treat those as unresolvable.
                codepoint = 0
        elif hexadecimal:
            codepoint = int(hexadecimal, 16)
        else:
            codepoint = 0
        # An empty result means "unresolvable": keep the original text.
        return codepoint_to_text(codepoint) or match.group()

    # Entity names may contain digits after the first letter (frac12, sup2,
    # there4, ...), so the name pattern must allow them.
    return re.sub(
        r'&(?:(?P<char>[a-z][a-z0-9]*)|#(?P<decimal>\d+)|#x(?P<hex>[\da-f]+));',
        replace_reference,
        escaped,
        flags=re.IGNORECASE
    )

if __name__ == '__main__':
    # Command-line entry point: join all arguments with spaces, unescape the
    # result and print its repr(); with no arguments, print a usage line.
    args = sys.argv[1:]
    if not args:
        print('usage: python unescape_charref.py *escaped text*')
    else:
        text = ' '.join(args)
        if isinstance(text, bytes):
            # Byte-string arguments must be decoded first.  stdin's encoding
            # is None when stdin is not a terminal, so fall back instead of
            # passing None to decode().
            encoding = (sys.stdin.encoding
                        or sys.getfilesystemencoding()
                        or 'utf-8')
            text = text.decode(encoding)
        print(repr(unescape_charref(text)))
	import re
	import sys
	from htmlentitydefs import name2codepoint

	def unescape_charref(escaped):
	    """Replace HTML character references in *escaped* with their characters.

	    Three forms are recognised, each introduced by ``&`` and closed by ``;``:

	    * named references such as ``&amp;`` or ``&frac12;``, resolved through
	      ``name2codepoint`` (``&apos;`` is handled separately because that
	      table does not contain it);
	    * decimal numeric references such as ``&#38;``;
	    * hexadecimal numeric references such as ``&#x26;``.

	    Matching is case-insensitive, but named references are looked up with
	    the case they were written in.  A reference that cannot be resolved
	    (unknown name, code point 0, or a number above U+10FFFF) is left in the
	    text unchanged.

	    Args:
	        escaped: Text that may contain character references.

	    Returns:
	        The text with every resolvable reference replaced.
	    """
	    try:
	        make_char = unichr  # Python 2 spelling of "code point -> text"
	    except NameError:
	        make_char = chr  # no unichr builtin: chr() already returns text

	    def codepoint_to_text(codepoint):
	        """Return the text for *codepoint*, or '' if it is not usable."""
	        if 0 < codepoint <= sys.maxunicode:
	            # Directly representable on this interpreter build.
	            return make_char(codepoint)
	        if sys.maxunicode < codepoint <= 0x10FFFF:
	            # Valid Unicode but above this build's limit: emit the
	            # corresponding UTF-16 surrogate pair instead.
	            offset = codepoint - 0x10000
	            high = 0xD800 | (offset >> 10)
	            low = 0xDC00 | (offset & 0x3FF)
	            return make_char(high) + make_char(low)
	        # Zero, or beyond the Unicode range.
	        return ''

	    def replace_reference(match):
	        """Return the replacement text for one matched reference."""
	        name = match.group('char')
	        decimal = match.group('decimal')
	        hexadecimal = match.group('hex')
	        if name:
	            if name == 'apos':
	                # 'apos' is not in name2codepoint.
	                codepoint = ord("'")
	            else:
	                codepoint = name2codepoint.get(name, 0)
	        elif decimal:
	            try:
	                codepoint = int(decimal)
	            except ValueError:
	                # Interpreters that cap str->int conversion length reject
	                # absurdly long digit runs; treat those as unresolvable.
	                codepoint = 0
	        elif hexadecimal:
	            codepoint = int(hexadecimal, 16)
	        else:
	            codepoint = 0
	        # An empty result means "unresolvable": keep the original text.
	        return codepoint_to_text(codepoint) or match.group()

	    # Entity names may contain digits after the first letter (frac12, sup2,
	    # there4, ...), so the name pattern must allow them.
	    return re.sub(
	        r'&(?:(?P<char>[a-z][a-z0-9]*)|#(?P<decimal>\d+)|#x(?P<hex>[\da-f]+));',
	        replace_reference,
	        escaped,
	        flags=re.IGNORECASE
	    )

	if __name__ == '__main__':
	    # Command-line entry point: join all arguments with spaces, unescape the
	    # result and print its repr(); with no arguments, print a usage line.
	    args = sys.argv[1:]
	    if not args:
	        print('usage: python unescape_charref.py *escaped text*')
	    else:
	        text = ' '.join(args)
	        if isinstance(text, bytes):
	            # Byte-string arguments must be decoded first.  stdin's encoding
	            # is None when stdin is not a terminal, so fall back instead of
	            # passing None to decode().
	            encoding = (sys.stdin.encoding
	                        or sys.getfilesystemencoding()
	                        or 'utf-8')
	            text = text.decode(encoding)
	        print(repr(unescape_charref(text)))