Skip to content

Instantly share code, notes, and snippets.

@yingminc
Last active June 30, 2017 04:56
Show Gist options
  • Save yingminc/b532b54f9c05b1c74cf0b8ee5e05f8ea to your computer and use it in GitHub Desktop.
Save yingminc/b532b54f9c05b1c74cf0b8ee5e05f8ea to your computer and use it in GitHub Desktop.
Character type check for Japanese text
#encoding: utf-8
import re
import argparse
import os
# Command-line interface: one required positional argument, `input_txt`.
# NOTE: parsing runs at module level, so it executes on import as well as when
# the file is run as a script; argparse prints a usage message and exits if the
# positional argument is missing.
parser =argparse.ArgumentParser()
parser.add_argument('input_txt', help = 'the string input')
# The parsed value is available as `args.input_txt`.
# NOTE(review): nothing in the visible file passes it on to chartype() --
# confirm whether a call such as chartype(args.input_txt) was intended.
args = parser.parse_args()
# (pattern, label) pairs, tested in this order for every character.
# Compiled once at import time instead of on every call to chartype().
_CHAR_CLASSES = (
    (re.compile(u'[0-9]'), 'Number'),
    (re.compile(u'[A-Za-z]'), 'Alphabet'),
    (re.compile(u'[\u3040-\u309F]'), 'Hiragana'),
    (re.compile(u'[\u30A0-\u30FF]'), 'Katakana'),
    (re.compile(u'[\u4E00-\u9FFF]'), 'Kanji'),
)


def chartype(text):
    """Classify every character of *text* by script type.

    Each character is labelled with the first matching category:

    * ``'Number'``   -- ASCII digits ``0-9``
    * ``'Alphabet'`` -- ASCII letters ``A-Z`` / ``a-z``
    * ``'Hiragana'`` -- U+3040 to U+309F
    * ``'Katakana'`` -- U+30A0 to U+30FF
    * ``'Kanji'``    -- U+4E00 to U+9FFF
    * ``'other character'`` -- anything else (including full-width digits
      and letters, whitespace and punctuation)

    Args:
        text: The text to inspect. Either a text string or a UTF-8 encoded
            byte string; byte strings are decoded before classification.

    Returns:
        A list of ``[character, label]`` two-element lists, one per
        character, in input order. Empty input yields an empty list.

    Raises:
        UnicodeDecodeError: If *text* is a byte string that is not valid
            UTF-8.

    The resulting list is also printed to standard output.
    """
    # Decode only byte strings. Calling .decode() on text that is already
    # unicode fails outright on Python 3 and forces an implicit ASCII encode
    # on Python 2, which breaks for any Japanese input.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    charlist = []
    for char in text:
        label = 'other character'
        for pattern, name in _CHAR_CLASSES:
            # Each pattern is a single character class and `char` is a single
            # character, so match() decides membership without anchors.
            if pattern.match(char) is not None:
                label = name
                break
        charlist.append([char, label])

    # Function-call form works under both Python 2 and Python 3.
    print(charlist)
    return charlist
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment