Created
June 20, 2013 17:40
-
-
Save ACEfanatic02/5824884 to your computer and use it in GitHub Desktop.
normalize_kana
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
try:
    _unichr = unichr  # Python 2: chr() only handles byte values
except NameError:
    _unichr = chr  # Python 3: chr() already returns Unicode characters

# Distance between a full-width katakana code point and the hiragana with
# the same reading (both blocks share the same internal layout).
FULL_KATA_OFFSET = ord(u"ァ") - ord(u"ぁ")

# Hiragana equivalents of the half-width katakana U+FF66..U+FF9D, listed in
# code point order.  That block starts with WO, the small kana and the
# prolonged sound mark (which has no hiragana form, so it maps to U+30FC).
half_kata_to_hira = (
    u"をぁぃぅぇぉゃゅょっ\u30fc"
    u"あいうえおかきくけこさしすせそたちつてと"
    u"なにぬねのはひふへほまみむめもやゆよ"
    u"らりるれろわん"
)

# Code point ranges are spelled out numerically so the two katakana forms
# cannot be confused by editors or tools that normalise width.
_FULL_KATA_FIRST = 0x30A1  # KATAKANA LETTER SMALL A
_FULL_KATA_LAST = 0x30F6   # KATAKANA LETTER SMALL KE
_HALF_KATA_FIRST = 0xFF66  # HALFWIDTH KATAKANA LETTER WO
_HALF_KATA_LAST = 0xFF9D   # HALFWIDTH KATAKANA LETTER N

# In the hiragana block the voiced form directly follows its base kana and
# the semi-voiced form (ha-row only) follows the voiced one.
_VOICED = dict(
    (kana, _unichr(ord(kana) + 1))
    for kana in u"かきくけこさしすせそたちつてとはひふへほ"
)
_VOICED[u"う"] = u"\u3094"  # VU lives at the end of the block, not at U+3047
_SEMI_VOICED = dict((kana, _unichr(ord(kana) + 2)) for kana in u"はひふへほ")

# Half-width sound marks and the table each one applies.
_SOUND_MARKS = {
    u"\uff9e": _VOICED,       # HALFWIDTH KATAKANA VOICED SOUND MARK
    u"\uff9f": _SEMI_VOICED,  # HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
}


def normalize_kana(s):
    """Normalizes all kana in the given string to full-width hiragana.

    Full-width katakana (U+30A1..U+30F6) and half-width katakana
    (U+FF66..U+FF9D) are replaced by the corresponding hiragana.  A
    half-width kana followed by a half-width (semi-)voiced sound mark is
    merged into the single voiced hiragana; a mark that follows a kana which
    cannot take it is left in place unchanged.  Every other character is
    copied through as is.

    Args:
        s   String to be normalized.

    Returns:
        A new string with the kana normalized.
    """
    chars = list(s)
    total = len(chars)
    out = []
    i = 0
    while i < total:
        ch = chars[i]
        code = ord(ch)
        i += 1
        if _FULL_KATA_FIRST <= code <= _FULL_KATA_LAST:
            # Full-width katakana
            ch = _unichr(code - FULL_KATA_OFFSET)
        elif _HALF_KATA_FIRST <= code <= _HALF_KATA_LAST:
            # Half-width katakana.  This unicode block is laid out differently.
            ch = half_kata_to_hira[code - _HALF_KATA_FIRST]
            if i < total:
                table = _SOUND_MARKS.get(chars[i])
                if table is not None and ch in table:
                    ch = table[ch]
                    i += 1  # the sound mark has been folded into ch
        out.append(ch)
    return ''.join(out)
if __name__ == '__main__':
    # Minimal self-test, run only when the file is executed as a script.
    assert normalize_kana(u"テスト") == u"てすと"
    assert normalize_kana(u"リセット") == u"りせっと"
    assert normalize_kana(u"ポ") == u"ぽ"
    assert normalize_kana(u"ヲ") == u"を"
    # Call form prints the same text on Python 2 and is valid on Python 3.
    print("tests pass")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment