Last active
June 8, 2016 11:59
-
-
Save hexists/8aba80f4425d1a9230ae to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
import re
import sys

# Matches decimal HTML numeric character references such as "&#44032;".
# Group 1 captures the decimal digits of the code point; hexadecimal
# references ("&#x41;") are not matched.
NUMERIC_CODE_PATTERN = re.compile(r'&#(\d+);')
def euc_to_utf(string):
    """Convert CP949 text to UTF-8 and expand decimal ``&#NNN;`` references.

    Args:
        string: A byte string encoded with the ``cp949`` codec. Text that
            is already decoded is also accepted and used as is. May be
            ``None``.

    Returns:
        The UTF-8 encoded byte string in which every reference matched by
        ``NUMERIC_CODE_PATTERN`` has been replaced by the character it
        names, or ``None`` when ``string`` is ``None``, empty, or contains
        only whitespace.

    Raises:
        UnicodeDecodeError: If ``string`` is a byte string that is not
            valid CP949.
    """
    if string is None or not string.strip():
        return None

    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    def _expand(match):
        code = int(match.group(1))
        # A lone surrogate cannot be encoded as well-formed UTF-8, so the
        # reference is left as written.
        if 0xD800 <= code <= 0xDFFF:
            return match.group(0)
        try:
            return to_char(code)
        except (ValueError, OverflowError):
            # Not a code point this interpreter can represent: keep the
            # literal reference rather than aborting the whole conversion.
            return match.group(0)

    text = string.decode('cp949') if isinstance(string, bytes) else string
    # A single substitution pass over the decoded text: characters produced
    # by one expansion are never re-scanned as part of another reference.
    return NUMERIC_CODE_PATTERN.sub(_expand, text).encode('utf-8')
if __name__ == '__main__':
    # Filter mode: read CP949 lines from stdin, write UTF-8 lines to stdout.
    # Python 3 exposes the raw byte streams as ``.buffer``; on Python 2 the
    # standard streams already carry byte strings.
    stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    for buf in stdin:
        utf_string = euc_to_utf(buf.rstrip())
        # euc_to_utf returns None for blank lines; emit an empty line so the
        # output stays aligned with the input instead of printing "None".
        stdout.write((utf_string or b'') + b'\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment