Skip to content

Instantly share code, notes, and snippets.

@riceissa
Created June 18, 2015 05:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save riceissa/1b14e354ab264658a476 to your computer and use it in GitHub Desktop.
Save riceissa/1b14e354ab264658a476 to your computer and use it in GitHub Desktop.
Filter out lines that contain Chinese characters
#!/usr/bin/python3
# From http://stackoverflow.com/a/9169489
# Code points treated as Chinese characters, built from half-open
# [start, stop) ranges.  set.union() accepts any iterables, so the ranges
# are merged directly without wrapping each one in its own set.
c1 = set().union(
    range(0x4E00, 0xA000),
    range(0x3400, 0x4DC0),
    range(0x20000, 0x2A6E0),
    range(0x2A700, 0x2B740),
    range(0x2B740, 0x2B820),
    range(0xF900, 0xFB00),
    range(0x2F800, 0x2FA20),
    # Lies entirely inside 0x4E00-0xA000 above; kept to mirror the source
    # list this was taken from.
    range(0x9FA6, 0x9FCC),
)
# From http://stackoverflow.com/a/2718268
# Han-script code points.  Each entry is either an inclusive [first, last]
# pair or a single code point; the trailing comment gives the general
# category, the number of code points in the range, and the names of the
# first and last characters.
LHan = [
    [0x2E80, 0x2E99],    # Han # So [26] CJK RADICAL REPEAT, CJK RADICAL RAP
    [0x2E9B, 0x2EF3],    # Han # So [89] CJK RADICAL CHOKE, CJK RADICAL C-SIMPLIFIED TURTLE
    [0x2F00, 0x2FD5],    # Han # So [214] KANGXI RADICAL ONE, KANGXI RADICAL FLUTE
    0x3005,              # Han # Lm IDEOGRAPHIC ITERATION MARK
    0x3007,              # Han # Nl IDEOGRAPHIC NUMBER ZERO
    [0x3021, 0x3029],    # Han # Nl [9] HANGZHOU NUMERAL ONE, HANGZHOU NUMERAL NINE
    [0x3038, 0x303A],    # Han # Nl [3] HANGZHOU NUMERAL TEN, HANGZHOU NUMERAL THIRTY
    0x303B,              # Han # Lm VERTICAL IDEOGRAPHIC ITERATION MARK
    [0x3400, 0x4DB5],    # Han # Lo [6582] CJK UNIFIED IDEOGRAPH-3400, CJK UNIFIED IDEOGRAPH-4DB5
    [0x4E00, 0x9FC3],    # Han # Lo [20932] CJK UNIFIED IDEOGRAPH-4E00, CJK UNIFIED IDEOGRAPH-9FC3
    [0xF900, 0xFA2D],    # Han # Lo [302] CJK COMPATIBILITY IDEOGRAPH-F900, CJK COMPATIBILITY IDEOGRAPH-FA2D
    [0xFA30, 0xFA6A],    # Han # Lo [59] CJK COMPATIBILITY IDEOGRAPH-FA30, CJK COMPATIBILITY IDEOGRAPH-FA6A
    [0xFA70, 0xFAD9],    # Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70, CJK COMPATIBILITY IDEOGRAPH-FAD9
    [0x20000, 0x2A6D6],  # Han # Lo [42711] CJK UNIFIED IDEOGRAPH-20000, CJK UNIFIED IDEOGRAPH-2A6D6
    [0x2F800, 0x2FA1D],  # Han # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800, CJK COMPATIBILITY IDEOGRAPH-2FA1D
]

# Flatten LHan into a set of individual code points for O(1) membership
# tests.  The set is grown in place rather than rebuilt with `|` on every
# entry, which would copy everything accumulated so far each time.
c2 = set()
for entry in LHan:
    if isinstance(entry, list):
        first, last = entry
        # LHan pairs are inclusive on both ends, hence the +1.
        c2.update(range(first, last + 1))
    else:
        c2.add(entry)
def lines_without_codepoints(lines, blocked):
    """Yield each line of *lines* that contains no code point in *blocked*.

    Args:
        lines: iterable of strings (for example an open text file).
        blocked: container of integer code points supporting ``in``;
            a ``set`` gives constant-time checks.

    Yields:
        The lines, unchanged and in order, in which no character's
        ``ord()`` value is a member of *blocked*.  A line is dropped as a
        whole as soon as one blocked character is found.
    """
    for line in lines:
        if not any(ord(ch) in blocked for ch in line):
            yield line


if __name__ == "__main__":
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description="Filter out Chinese characters")
    parser.add_argument("input_file", nargs="?",
                        type=argparse.FileType("r"), default=sys.stdin,
                        help="input file; default to stdin")
    args = parser.parse_args()

    # Merge the two lookup tables once up front.  Evaluating `c1 | c2`
    # inside the per-character test would rebuild the whole union for
    # every character of every input line.
    han = c1 | c2
    sys.stdout.writelines(lines_without_codepoints(args.input_file, han))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment