Created
June 18, 2015 05:23
-
-
Save riceissa/1b14e354ab264658a476 to your computer and use it in GitHub Desktop.
filter out Chinese chars
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3

# From http://stackoverflow.com/a/9169489
# c1 is the set of integer code points treated as Chinese characters,
# built from half-open ``range(start, stop)`` spans.
c1 = set().union(
    range(0x4E00, 0xA000),    # CJK Unified Ideographs
    range(0x3400, 0x4DC0),    # CJK Unified Ideographs Extension A
    range(0x20000, 0x2A6E0),  # CJK Unified Ideographs Extension B
    range(0x2A700, 0x2B740),  # CJK Unified Ideographs Extension C
    range(0x2B740, 0x2B820),  # CJK Unified Ideographs Extension D
    range(0xF900, 0xFB00),    # CJK Compatibility Ideographs
    range(0x2F800, 0x2FA20),  # CJK Compatibility Ideographs Supplement
    # Already contained in the first span (0x4E00-0x9FFF); kept so the
    # list still mirrors the referenced answer.
    range(0x9FA6, 0x9FCC),
)
# From http://stackoverflow.com/a/2718268
# Each entry is either a two-element list ``[first, last]`` giving an
# INCLUSIVE code-point range, or a bare int naming a single code point.
LHan = [[0x2E80, 0x2E99],    # Han # So  [26] CJK RADICAL REPEAT, CJK RADICAL RAP
        [0x2E9B, 0x2EF3],    # Han # So  [89] CJK RADICAL CHOKE, CJK RADICAL C-SIMPLIFIED TURTLE
        [0x2F00, 0x2FD5],    # Han # So [214] KANGXI RADICAL ONE, KANGXI RADICAL FLUTE
        0x3005,              # Han # Lm       IDEOGRAPHIC ITERATION MARK
        0x3007,              # Han # Nl       IDEOGRAPHIC NUMBER ZERO
        [0x3021, 0x3029],    # Han # Nl   [9] HANGZHOU NUMERAL ONE, HANGZHOU NUMERAL NINE
        [0x3038, 0x303A],    # Han # Nl   [3] HANGZHOU NUMERAL TEN, HANGZHOU NUMERAL THIRTY
        0x303B,              # Han # Lm       VERTICAL IDEOGRAPHIC ITERATION MARK
        [0x3400, 0x4DB5],    # Han # Lo [6582] CJK UNIFIED IDEOGRAPH-3400, CJK UNIFIED IDEOGRAPH-4DB5
        [0x4E00, 0x9FC3],    # Han # Lo [20932] CJK UNIFIED IDEOGRAPH-4E00, CJK UNIFIED IDEOGRAPH-9FC3
        [0xF900, 0xFA2D],    # Han # Lo [302] CJK COMPATIBILITY IDEOGRAPH-F900, CJK COMPATIBILITY IDEOGRAPH-FA2D
        [0xFA30, 0xFA6A],    # Han # Lo  [59] CJK COMPATIBILITY IDEOGRAPH-FA30, CJK COMPATIBILITY IDEOGRAPH-FA6A
        [0xFA70, 0xFAD9],    # Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70, CJK COMPATIBILITY IDEOGRAPH-FAD9
        [0x20000, 0x2A6D6],  # Han # Lo [42711] CJK UNIFIED IDEOGRAPH-20000, CJK UNIFIED IDEOGRAPH-2A6D6
        [0x2F800, 0x2FA1D]]  # Han # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800, CJK COMPATIBILITY IDEOGRAPH-2FA1D

# c2 is LHan expanded into a flat set of integer code points.
c2 = set()
for entry in LHan:
    if isinstance(entry, list):
        first, last = entry
        # The table's upper bounds are inclusive, hence the +1.
        c2.update(range(first, last + 1))
    else:
        c2.add(entry)
def contains_chinese(text, codepoints):
    """Return True if any character of *text* has its code point in *codepoints*.

    text: the string to scan (an empty string yields False).
    codepoints: a container of integer code points supporting ``in``.
    """
    return any(ord(ch) in codepoints for ch in text)


def main():
    """Copy input lines to stdout, dropping every line that contains a
    character whose code point is in ``c1`` or ``c2``.

    Input is read from the optional positional ``input_file`` argument,
    falling back to stdin when it is omitted.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description="Filter out Chinese characters")
    parser.add_argument("input_file", nargs="?",
                        type=argparse.FileType("r"), default=sys.stdin,
                        help="input file; default to stdin")
    args = parser.parse_args()

    # Build the combined lookup set once; evaluating ``c1 | c2`` inside the
    # per-character test would rebuild the whole union for every character.
    chinese = c1 | c2
    for line in args.input_file:
        if not contains_chinese(line, chinese):
            sys.stdout.write(line)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment