Created
September 19, 2016 04:53
-
-
Save neet/461825afb41b11cf49867fb9ec503c0c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
try:
    import htmlentitydefs  # Python 2
except ImportError:
    import html.entities as htmlentitydefs  # Python 3
import re
# Convert entity references & character references back to ordinary characters.

# Code point -> character builtin: ``unichr`` on Python 2, ``chr`` on Python 3.
try:
    _unichr = unichr
except NameError:
    _unichr = chr

# Matches one reference: "&#x<hex>;", "&#<decimal>;" or "&<name>;".
# The hex and decimal alternatives are kept separate so that a malformed
# reference such as "&#12ab;" is not matched at all (and is left untouched).
# Names may contain digits after the first letter (e.g. "frac12", "sup2").
_REFERENCE_RE = re.compile(
    u'&(#x[0-9a-f]+|#[0-9]+|[a-z][a-z0-9]*);', re.IGNORECASE)


def _decode_reference(match):
    """Return the character for one matched reference, or the reference
    text itself when it cannot be resolved."""
    name = match.group(1)
    if name[0] != u'#':
        # Named entity reference; the lookup is case-sensitive because the
        # table distinguishes e.g. "Aacute" from "aacute".
        codepoint = htmlentitydefs.name2codepoint.get(name)
        if codepoint is None:
            return match.group(0)
    elif name[1] in u'xX':
        # Hexadecimal character reference.
        codepoint = int(name[2:], 16)
    else:
        # Decimal character reference.
        codepoint = int(name[1:])
    try:
        return _unichr(codepoint)
    except (ValueError, OverflowError):
        # The code point is outside the range the interpreter can represent.
        return match.group(0)


def htmlentity2unicode(text):
    """Replace HTML entity and numeric character references in *text*.

    Handles named references (``&amp;``), decimal references (``&#12354;``)
    and hexadecimal references (``&#x3042;``, ``&#xFF;``).

    References that cannot be resolved (unknown names, malformed numbers,
    out-of-range code points) are copied to the output unchanged instead of
    being dropped or raising.

    Args:
        text: The unicode string to decode.

    Returns:
        A unicode string with every resolvable reference replaced by the
        character it denotes.
    """
    pieces = []
    pos = 0
    for match in _REFERENCE_RE.finditer(text):
        # Literal text between the previous reference and this one.
        pieces.append(text[pos:match.start()])
        pieces.append(_decode_reference(match))
        pos = match.end()
    # Trailing text after the last reference (or the whole input if none).
    pieces.append(text[pos:])
    return u''.join(pieces)
# Test code
text = u"文字参照 & 実体参照 を通常の文字に戻します。"
# A parenthesised single argument prints identically under the Python 2
# print statement and the Python 3 print function.
print(htmlentity2unicode(text))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment