Last active
May 25, 2023 09:16
-
-
Save Miopas/455f634f5b1f4303ea8d7d65f02ec1d8 to your computer and use it in GitHub Desktop.
去除文本中各种符号的python脚本
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf8 | |
import string | |
import sys | |
import re | |
# Resolve the "code point -> one-character text" builtin once, at module scope,
# where the name ``chr`` still refers to the builtin (inside conv_wide it is
# shadowed by the parameter of the same name).
try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr() already returns text


def conv_wide(chr):
    """Convert a full-width character to its half-width equivalent.

    The ideographic space U+3000 becomes an ASCII space, and the full-width
    forms U+FF01..U+FF5E are shifted down by 0xFEE0 onto ASCII 0x21..0x7E.
    Any other character is returned unchanged.

    Args:
        chr: A single character. The name shadows the builtin but is kept
            because it is part of the function's signature.

    Returns:
        A one-character text string (``unicode`` on Python 2, ``str`` on
        Python 3).

    Raises:
        TypeError: If ``chr`` is not exactly one character (raised by ord()).
    """
    code = ord(chr)
    if chr == u'\u3000':  # ideographic (full-width) space
        code = 0x20
    elif u'\uff01' <= chr <= u'\uff5e':
        # Full-width forms sit exactly 0xFEE0 (65248) above their ASCII twins.
        code -= 0xFEE0
    return _unichr(code)
# Inclusive (first, last) bounds of the Unicode ranges treated as symbols.
_SYMBOL_RANGES = (
    (u'\u2000', u'\u206f'),  # General Punctuation
    (u'\u3000', u'\u303f'),  # CJK Symbols and Punctuation
    (u'\ufe30', u'\ufe4f'),  # CJK Compatibility Forms
    (u'\ufe10', u'\ufe1f'),  # Vertical Forms
    (u'\u3100', u'\u312f'),  # Bopomofo (phonetic symbols)
    (u'\u2600', u'\u26ff'),  # Miscellaneous Symbols
    (u'\u2700', u'\u27bf'),  # Dingbats (decorative symbols)
)


def is_symbol(chr):
    """Return True if ``chr`` falls inside one of the symbol ranges.

    The ranges cover general punctuation, CJK punctuation and its
    compatibility/vertical forms, Bopomofo, miscellaneous symbols and
    dingbats. The check is a plain string comparison against each range's
    inclusive bounds, tried in the order listed in ``_SYMBOL_RANGES``.

    Args:
        chr: The character to classify (name kept for interface
            compatibility even though it shadows the builtin).

    Returns:
        True when the character lies in any listed range, otherwise False.
    """
    return any(first <= chr <= last for first, last in _SYMBOL_RANGES)
def clear_symbol(text):
    """Strip symbols and punctuation from UTF-8 text.

    Each character is first normalised from full-width to half-width with
    conv_wide(), then dropped if is_symbol() classifies it as a symbol.
    ASCII punctuation (string.punctuation) is removed from the remainder and
    surrounding whitespace is trimmed.

    Args:
        text: UTF-8 encoded bytes (a plain ``str`` on Python 2). Text that is
            already decoded (``unicode`` on Python 2, ``str`` on Python 3) is
            accepted as well and used as-is.

    Returns:
        The cleaned text as UTF-8 encoded bytes (a plain ``str`` on
        Python 2). An empty byte string is returned when the input cannot be
        decoded as UTF-8 or is not a string at all.
    """
    if isinstance(text, (bytes, bytearray)):
        try:
            text = text.decode('utf8')
        except UnicodeDecodeError:
            text = None  # fall through to the shared error path below
    if not isinstance(text, type(u'')):
        # NOTE(review): print_counter is not defined or imported in this
        # file; it is presumably a metrics helper supplied by the
        # surrounding project -- confirm it is in scope where this runs.
        print_counter('utils_err', 'clear_symbol', 1)
        return b''

    kept = []
    for ch in text:
        ch = conv_wide(ch)
        if not is_symbol(ch):
            kept.append(ch)

    encoded = u''.join(kept).encode('utf8')
    # Remove ASCII punctuation. The delete table is encoded to bytes so the
    # same call works on Python 2 str and Python 3 bytes.
    encoded = encoded.translate(None, string.punctuation.encode('ascii'))
    return encoded.strip()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment