Skip to content

Instantly share code, notes, and snippets.

@ACEfanatic02
Created June 20, 2013 17:40
Show Gist options
  • Save ACEfanatic02/5824884 to your computer and use it in GitHub Desktop.
Save ACEfanatic02/5824884 to your computer and use it in GitHub Desktop.
normalize_kana
# -*- coding: utf-8 -*-
try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr() already returns text

# Distance between a full-width katakana code point and its hiragana twin.
FULL_KATA_OFFSET = ord(u"\u30a1") - ord(u"\u3041")

# Hiragana for every code point of the half-width katakana block,
# U+FF66 (half-width WO) through U+FF9D (half-width N), in code point order.
# That block is laid out differently from the full-width one, hence a table.
half_kata_to_hira = (
    u"をぁぃぅぇぉゃゅょっー"
    u"あいうえおかきくけこさしすせそたちつてと"
    u"なにぬねのはひふへほまみむめもやゆよ"
    u"らりるれろわん"
)

# Code point ranges are written numerically so that the source cannot be
# silently corrupted by Unicode normalisation of look-alike literals.
_FULL_KATA_FIRST = 0x30A1  # KATAKANA LETTER SMALL A
_FULL_KATA_LAST = 0x30F6   # KATAKANA LETTER SMALL KE (covers VU, small KA/KE)
_HALF_KATA_FIRST = 0xFF66  # HALFWIDTH KATAKANA LETTER WO
_HALF_KATA_LAST = 0xFF9D   # HALFWIDTH KATAKANA LETTER N

# Voicing marks that may follow a half-width kana: the half-width forms and,
# for leniency, the combining forms.
_DAKUTEN_MARKS = (u"\uff9e", u"\u3099")
_HANDAKUTEN_MARKS = (u"\uff9f", u"\u309a")

# In the hiragana block the voiced form sits at +1 and the semi-voiced form at
# +2, but only for these base kana.  U (-> VU) is the one irregular case.
_VOICED = {base: _unichr(ord(base) + 1)
           for base in u"かきくけこさしすせそたちつてとはひふへほ"}
_VOICED[u"う"] = u"\u3094"
_SEMI_VOICED = {base: _unichr(ord(base) + 2) for base in u"はひふへほ"}


def normalize_kana(s):
    """Normalizes all kana in the given string to full-width hiragana.

    Full-width katakana are shifted into the hiragana block; half-width
    katakana are mapped through ``half_kata_to_hira``.  A half-width kana
    followed by a (han)dakuten mark is merged into the single voiced or
    semi-voiced hiragana when such a character exists; otherwise the mark is
    left in place untouched.  Every other character is passed through as is.

    Args:
        s   String to be normalized.

    Returns:
        The normalized string.
    """
    chars = list(s)
    total = len(chars)
    out = []
    skip_next = False
    for i, ch in enumerate(chars):
        if skip_next:
            # This character was a voicing mark already merged into the
            # previous kana.
            skip_next = False
            continue
        code = ord(ch)
        if _FULL_KATA_FIRST <= code <= _FULL_KATA_LAST:
            # Full-width katakana: same layout as hiragana, fixed offset.
            ch = _unichr(code - FULL_KATA_OFFSET)
        elif _HALF_KATA_FIRST <= code <= _HALF_KATA_LAST:
            # Half-width katakana: table lookup, then try to absorb a mark.
            ch = half_kata_to_hira[code - _HALF_KATA_FIRST]
            mark = chars[i + 1] if i + 1 < total else None
            if mark in _DAKUTEN_MARKS:
                merged = _VOICED.get(ch)
            elif mark in _HANDAKUTEN_MARKS:
                merged = _SEMI_VOICED.get(ch)
            else:
                merged = None
            if merged is not None:
                ch = merged
                skip_next = True
        out.append(ch)
    return ''.join(out)
if __name__ == '__main__':
    # Minimal smoke tests, run only when the file is executed as a script.
    assert normalize_kana(u"テスト") == u"てすと"
    assert normalize_kana(u"リセット") == u"りせっと"
    assert normalize_kana(u"ポ") == u"ぽ"
    assert normalize_kana(u"ヲ") == u"を"
    # Parenthesised so the module also parses on Python 3; on Python 2 this
    # prints exactly the same string.
    print("tests pass")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment