Last active
May 6, 2019 13:36
-
-
Save btspoony/a06f2e188d697efb4a3e8a544b7b4d39 to your computer and use it in GitHub Desktop.
筛选文件中的中文字符
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Collect the strings containing Chinese characters from the input files
(by default ./data.sql) and write them to a CSV file next to each input.
"""
import sys
import os
import re
import csv

# Directory containing this script; input file names are joined onto it.
BASE_PATH = os.path.dirname(os.path.abspath(__file__))
# File processed when the command line does not name any file with -f.
DEFAULT_FILE = "./data.sql"
def haschinese(name):
    """
    Report whether ``name`` contains at least one Chinese character.

    A character counts as Chinese when it lies in the range U+4E00..U+9FFF
    (the CJK Unified Ideographs block).

    Args:
        name: Text to inspect. UTF-8 encoded bytes are decoded first; text
            that is already decoded is scanned as-is.

    Returns:
        True if any character falls inside the range, otherwise False.

    Raises:
        UnicodeDecodeError: If ``name`` is a byte string that is not valid
            UTF-8.
    """
    # Only byte strings need decoding. Calling .decode() on decoded text
    # raises AttributeError on Python 3 and forces an implicit ASCII encode
    # on Python 2, so decoded text is passed through untouched.
    text = name.decode('utf-8') if isinstance(name, bytes) else name
    return any(u'\u4e00' <= char <= u'\u9fff' for char in text)
def help_args():
    """
    Print the usage hint to standard output.

    The message asks the user to supply the name of the file to convert.

    Returns:
        None.
    """
    # Parenthesised form on purpose: Python 2 reads it as the print statement
    # applied to a parenthesised expression (same output), and Python 3 reads
    # it as a call to the print function.
    print("请输入要转换的文件名")
def arg_check(key):
    """
    Tell whether ``key`` is one of the recognised option flags.

    Only "-f" is recognised.

    Returns:
        True when ``key`` is a recognised flag, otherwise False.
    """
    known_flags = ("-f",)
    return key in known_flags
def arg_value(argv, key):
    """
    Collect the tokens that follow ``key`` in ``argv``.

    Collection starts right after the first occurrence of ``key`` and stops
    at the next token that arg_check() recognises as a flag, or at the end
    of ``argv``.

    Args:
        argv: List of command-line tokens.
        key: The flag whose values are wanted.

    Returns:
        A list of the collected tokens (possibly empty).

    Raises:
        ValueError: If ``key`` does not occur in ``argv``.
    """
    collected = []
    first_value = argv.index(key) + 1
    for token in argv[first_value:]:
        if arg_check(token):
            break
        collected.append(token)
    return collected
def arg_set(argv, args):
    """
    Copy the values given on the command line into ``args``.

    Args:
        argv: List of command-line tokens.
        args: Dict of parsed arguments, updated in place. Its "file" entry
            is replaced by the tokens that follow "-f" when there are any.

    Returns:
        None.
    """
    if "-f" not in argv:
        return
    files = arg_value(argv, "-f")
    # A bare "-f" yields no values. Keep the existing entry in that case
    # instead of overwriting it with an empty list, which would leave the
    # caller with no file to process.
    if files:
        args["file"] = files
def arg_parse():
    """
    Build the argument dict from ``sys.argv``.

    Returns:
        None when "-h" appears on the command line. Otherwise a dict whose
        "file" entry starts as ``[DEFAULT_FILE]`` and is then updated by
        arg_set().
    """
    cli_tokens = sys.argv
    if "-h" not in cli_tokens:
        parsed = {"file": [DEFAULT_FILE]}
        arg_set(cli_tokens, parsed)
        return parsed
    return None
def find_in_line(line, reg, libs):
    """
    Add the Chinese-containing captures of ``reg`` in ``line`` to ``libs``.

    A capture that begins with "[" or "{" is not stored itself. It is
    searched again for double-quoted strings whose quotes are
    backslash-escaped, and those inner strings are examined instead.

    Args:
        line: Text to search.
        reg: Regular expression applied with re.findall() and re.U.
        libs: Set that receives every capture for which haschinese() is
            true; it is modified in place.

    Returns:
        None.
    """
    for captured in re.findall(reg, line, flags=re.U):
        if captured.startswith(("[", "{")):
            # Looks like an embedded list/object: descend into its
            # escaped string values.
            find_in_line(captured, r"\\\"([^\\]+)\\\"", libs)
        elif haschinese(captured):
            libs.add(captured)
def main():
    """
    Extract the Chinese strings of every input file into a CSV file.

    The file names come from arg_parse() and are resolved against BASE_PATH.
    Each line of a file is scanned for single-quoted values that follow a
    comma. The strings collected by find_in_line() are written to
    "<input path>.csv" under the header row "from,to", one string per row
    with the second column left empty.

    When arg_parse() returns nothing, the help text is printed instead.

    Returns:
        None.
    """
    # Python 2 only: make UTF-8 the implicit str/unicode conversion codec.
    # The reload() builtin does not exist on Python 3, where this step is
    # skipped.
    try:
        reload(sys)
        sys.setdefaultencoding('utf-8')
    except NameError:
        pass

    args = arg_parse()
    if not args:
        return help_args()

    for name in args["file"]:
        file_path = os.path.normpath(os.path.join(BASE_PATH, name))

        # Every Chinese string found in this file.
        libs = set()
        with open(file_path, "r") as source:
            for line in source:
                find_in_line(line, r",'([^']+)'", libs)

        with open(file_path + '.csv', 'w') as target:
            # csv.writer quotes values containing commas or quotes, which
            # plain string concatenation would turn into broken rows.
            # lineterminator keeps the "\n" row ending used before.
            writer = csv.writer(target, lineterminator="\n")
            writer.writerow(["from", "to"])
            # Sorted so that repeated runs produce the same file.
            for word in sorted(libs):
                writer.writerow([word, ""])
# Run the extraction only when this file is executed as a script.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment