Skip to content

Instantly share code, notes, and snippets.

@Miopas
Last active May 25, 2023 09:16
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Miopas/455f634f5b1f4303ea8d7d65f02ec1d8 to your computer and use it in GitHub Desktop.
Save Miopas/455f634f5b1f4303ea8d7d65f02ec1d8 to your computer and use it in GitHub Desktop.
去除文本中各种符号的python脚本
#coding=utf8
import string
import sys
import re
# Code-point -> text-character converter that works on both Python 2 and 3.
# It is resolved at module level because ``conv_wide`` keeps its historical
# parameter name ``chr``, which hides the builtin inside the function body.
try:
    _code_to_char = unichr  # Python 2: chr() only yields byte strings
except NameError:
    _code_to_char = chr  # Python 3: chr() already yields text


def conv_wide(chr):
    """Convert a full-width character to its half-width equivalent.

    The ideographic space (U+3000) becomes an ASCII space, and the
    full-width forms U+FF01..U+FF5E become the matching ASCII characters
    U+0021..U+007E. Any other character is returned unchanged.

    Args:
        chr: A single character. The name shadows the builtin ``chr`` and
            is kept only so existing callers keep working.

    Returns:
        A one-character text (unicode) string.

    Raises:
        TypeError: If ``chr`` is not exactly one character long (raised by
            ``ord``).
    """
    code = ord(chr)
    if code == 0x3000:
        # Ideographic space has no fixed-offset twin; map it explicitly.
        code = 0x20
    elif 0xFF01 <= code <= 0xFF5E:
        # Full-width forms sit exactly 0xFEE0 (65248) above their ASCII twins.
        code -= 0xFEE0
    return _code_to_char(code)
# Inclusive (first, last) character ranges that is_symbol() treats as symbols.
_SYMBOL_RANGES = (
    (u'\u2000', u'\u206f'),  # general punctuation
    (u'\u3000', u'\u303f'),  # CJK symbols and punctuation (e.g. U+3016, U+3017)
    (u'\ufe30', u'\ufe4f'),  # CJK compatibility forms
    (u'\ufe10', u'\ufe1f'),  # vertical forms
    (u'\u3100', u'\u312f'),  # Bopomofo phonetic symbols (e.g. U+3121)
    (u'\u2600', u'\u26ff'),  # miscellaneous symbols (e.g. U+26BD)
    (u'\u2700', u'\u27bf'),  # dingbats
)


def is_symbol(chr):
    """Return True if ``chr`` lies in one of the ``_SYMBOL_RANGES``.

    ASCII punctuation is *not* covered here; only the non-ASCII punctuation
    and symbol ranges listed in the table are checked.

    Args:
        chr: A single text character. The name shadows the builtin ``chr``
            and is kept only so existing callers keep working.

    Returns:
        bool: True when the character falls inside any listed range
        (bounds inclusive), False otherwise.
    """
    return any(first <= chr <= last for first, last in _SYMBOL_RANGES)
def clear_symbol(text):
    """Remove punctuation and symbol characters from ``text``.

    Each character is first normalised with ``conv_wide`` (full-width to
    half-width). It is then dropped if ``is_symbol`` flags it or if it is
    ASCII punctuation (``string.punctuation``). Finally, leading and
    trailing ASCII whitespace is stripped.

    Args:
        text: UTF-8 encoded bytes, or an already-decoded text string.

    Returns:
        The cleaned text as the interpreter's native ``str`` type: UTF-8
        encoded bytes on Python 2 (as before), text on Python 3. Returns
        ``''`` when the input is not a string or is not valid UTF-8.
    """
    if isinstance(text, (bytes, bytearray)):
        try:
            text = text.decode('utf8')
        except UnicodeDecodeError:
            # NOTE(review): print_counter is not defined in this file; it is
            # presumably supplied by the surrounding project -- confirm.
            print_counter('utils_err', 'clear_symbol', 1)
            return ''
    elif not isinstance(text, type(u'')):
        # Non-string input (e.g. None) keeps the old best-effort contract:
        # count the error and return an empty result instead of raising.
        print_counter('utils_err', 'clear_symbol', 1)
        return ''

    ascii_punct = frozenset(string.punctuation)
    kept = []
    for wide in text:
        narrow = conv_wide(wide)
        # Full-width punctuation has just been folded to ASCII, so the
        # ASCII check below catches it as well.
        if is_symbol(narrow) or narrow in ascii_punct:
            continue
        kept.append(narrow)

    # Strip only ASCII whitespace, matching the byte-string strip the
    # original performed after encoding.
    res = u''.join(kept).strip(string.whitespace)
    if not isinstance(res, str):
        # Python 2: callers expect a UTF-8 encoded byte string back.
        res = res.encode('utf8')
    return res
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment