Skip to content

Instantly share code, notes, and snippets.

@ceshine
Last active April 22, 2020 05:57
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ceshine/1f8cd81ce34d89f1429d0928c28d97e4 to your computer and use it in GitHub Desktop.
Save ceshine/1f8cd81ce34d89f1429d0928c28d97e4 to your computer and use it in GitHub Desktop.
A Simple CJK Language Detector
import re
def cjk_detect(texts):
    """Guess whether a string is Korean, Japanese, or Chinese.

    The checks run in a fixed order and the first one that matches wins:
    Hangul syllables give "ko", then kana gives "ja", then CJK ideographs
    give "zh".  Korean and Japanese are tested before Chinese because both
    can contain Chinese characters.

    This is a character-range heuristic, so Japanese text written without
    any kana is reported as "zh", and Chinese text that borrows kana is
    reported as "ja".

    Args:
        texts: The string to inspect.

    Returns:
        "ko", "ja", "zh", or None when no character falls in any range.
    """
    # korean: precomposed Hangul syllables
    if re.search("[\uac00-\ud7a3]", texts):
        return "ko"
    # japanese: hiragana and katakana, minus U+30FB (katakana middle dot).
    # The middle dot is also used as punctuation in Chinese text, so on its
    # own it must not mark a string as Japanese.
    if re.search("[\u3040-\u30fa\u30fc-\u30ff]", texts):
        return "ja"
    # chinese: CJK unified ideographs
    if re.search("[\u4e00-\u9fff]", texts):
        return "zh"
    return None
def test_cjk_detect():
    """Assert that cjk_detect labels a set of reference sentences correctly."""
    # Sentences without any CJK characters must not be labelled at all.
    non_cjk_samples = [
        # Pure English
        "Is Obstruction an Impeachable Offense? History Says Yes",
        # Pure French
        "Damian Lillard a réussi un nouveau shoot de la victoire"
        " au buzzer à très longue distance",
    ]
    for sentence in non_cjk_samples:
        assert cjk_detect(sentence) is None

    # (sentence, expected language code) pairs.
    labelled_samples = [
        # Simplified Chinese
        (
            "2009年,波音公司(Boeing)在查尔斯顿附近的新厂破土动工时,曾宣扬这里是最先进的制造中心"
            ",将制造一款世界上最先进的飞机。但在接下来的十年里,这家生产787梦想客机的工厂一直受到做"
            "工粗糙和监管不力的困扰,危及航空安全。",
            "zh",
        ),
        # Traditional Chinese
        (
            "北查爾斯頓工廠的安全漏洞已經引起了航空公司和監管機構的密切關注。",
            "zh",
        ),
        # Japanese
        (
            "日産自動車は24日、2019年3月期の連結業績予想を下方修正した。",
            "ja",
        ),
        # Korean
        (
            "투서로 뜨고 투서에 지나",
            "ko",
        ),
        # Korean with a Chinese character
        (
            "北 외무성 간부 총살설 주민들 사이서 확산…하노이 회담 실패 때문",
            "ko",
        ),
    ]
    for sentence, language in labelled_samples:
        assert cjk_detect(sentence) == language
def print_incorrect_cases():
    """Print hard samples with their expected label and the detected one.

    Each output line shows the sample text, the label a human would assign,
    and whatever cjk_detect returns for it, so mismatches can be read off
    directly.
    """
    hard_samples = (
        # Japanese
        ("日産自動車、営業益45%減 前期下方修正", "expected: ja actual:"),
        # Traditional Chinese with Japanese hiragana
        ("健康の油切 好吃の涼麵", "expected: zh actual:"),
        # Traditional Chinese with Japanese katakana punctuation
        ("鐵腕・都鐸王朝(五):文藝復興最懂穿搭的高富帥——亨利八世", "expected: zh actual:"),
    )
    for sentence, label in hard_samples:
        print(sentence, label, cjk_detect(sentence))
if __name__ == "__main__":
    # When run as a script: execute the assertion-based checks first, then
    # print the hard samples together with their detected labels.
    for step in (test_cjk_detect, print_incorrect_cases):
        step()
@maolopez
Copy link

Thanks a lot! I used part of your solution here https://mauricio-271700.appspot.com/ https://github.com/maolopez/ut_anagramma

@ceshine
Copy link
Author

ceshine commented Apr 22, 2020

> Thanks a lot! I used part of your solution here https://mauricio-271700.appspot.com/ https://github.com/maolopez/ut_anagramma

You're welcome! Thanks for letting me know.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment