Last active
June 30, 2017 04:56
-
-
Save yingminc/b532b54f9c05b1c74cf0b8ee5e05f8ea to your computer and use it in GitHub Desktop.
Character type check for Japanese text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#encoding: utf-8 | |
import re | |
import argparse | |
import os | |
# Command-line interface: one required positional argument, `input_txt`.
parser = argparse.ArgumentParser()
parser.add_argument(
    'input_txt',
    help='the string input',
)
# Parsed at module level, so `args.input_txt` is available to the rest of the script.
args = parser.parse_args()
# (pattern, label) pairs, tried in order; the first pattern that matches a
# single character decides its label. Compiled once instead of on every call.
_CHARTYPE_PATTERNS = (
    (re.compile(u'^[0-9]$'), 'Number'),
    (re.compile(u'^[A-Za-z]$'), 'Alphabet'),
    (re.compile(u'^[\u3040-\u309F]$'), 'Hiragana'),
    (re.compile(u'^[\u30A0-\u30FF]$'), 'Katakana'),
    (re.compile(u'^[\u4E00-\u9FFF]$'), 'Kanji'),
)


def chartype(text):
    """Label every character of *text* with its character type.

    Each character is classified as one of:

    * ``'Number'``          -- ASCII digits ``0-9``
    * ``'Alphabet'``        -- ASCII letters ``A-Z`` / ``a-z``
    * ``'Hiragana'``        -- U+3040..U+309F
    * ``'Katakana'``        -- U+30A0..U+30FF
    * ``'Kanji'``           -- U+4E00..U+9FFF
    * ``'other character'`` -- anything else

    Args:
        text: The text to classify. Either an already-decoded text string or
            a UTF-8 encoded byte string (which is decoded first).

    Returns:
        A list with one ``[character, label]`` pair per character, in input
        order. An empty input yields an empty list.

    Raises:
        UnicodeDecodeError: If *text* is a byte string that is not valid UTF-8.

    The resulting list is also printed to standard output before returning.
    """
    # Only byte strings need decoding. Calling .decode() on text that is
    # already decoded fails on Python 3 and, for non-ASCII input, on Python 2.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    charlist = []
    for char in text:
        label = 'other character'
        for pattern, name in _CHARTYPE_PATTERNS:
            if pattern.match(char) is not None:
                label = name
                break
        charlist.append([char, label])

    # Function-call form works on both Python 2 and Python 3.
    print(charlist)
    return charlist
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment