Skip to content

Instantly share code, notes, and snippets.

@Miopas
Created November 16, 2018 02:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Miopas/0c87579e9817e4e34c265434a7beced0 to your computer and use it in GitHub Desktop.
Save Miopas/0c87579e9817e4e34c265434a7beced0 to your computer and use it in GitHub Desktop.
Clear various symbols
#coding=utf8
import string
import sys
import re
def conv_wide(char):
    """Convert a full-width character to its half-width equivalent.

    The ideographic space (U+3000) becomes an ASCII space, and the
    full-width forms U+FF01..U+FF5E are shifted down by 0xFEE0 onto
    ASCII 0x21..0x7E. Any other character is returned unchanged.

    Args:
        char: A single-character string.

    Returns:
        The half-width counterpart of ``char``, or ``char`` itself when it
        has no half-width mapping.
    """
    codepoint = ord(char)
    if char == '\u3000':
        # Ideographic (full-width) space maps to a plain ASCII space.
        return ' '
    if not ('\uff01' <= char <= '\uff5e'):
        # Outside the full-width ASCII variants: nothing to convert.
        return chr(codepoint)
    # Full-width forms sit exactly 0xFEE0 above their ASCII counterparts.
    return chr(codepoint - 0xFEE0)
# Inclusive (first, last) character ranges that is_symbol() treats as symbols.
# Note that several ranges cover letters (Latin-1, Cyrillic, kana), not only
# punctuation: everything listed here is filtered out by callers.
_SYMBOL_RANGES = (
    ('\u2000', '\u206f'),  # General punctuation
    ('\u3000', '\u303f'),  # CJK symbols and punctuation, e.g. 〖〗
    ('\ufe30', '\ufe4f'),  # CJK compatibility forms
    ('\ufe10', '\ufe1f'),  # Vertical forms
    ('\u3100', '\u312f'),  # Bopomofo, e.g. ㄡ
    ('\u2700', '\u27bf'),  # Dingbats, e.g. ✿
    ('\u00a0', '\u00ff'),  # Latin-1 supplement
    ('\u25a0', '\u25ff'),  # Geometric shapes
    ('\u2600', '\u26ff'),  # Miscellaneous symbols, e.g. ⚽
    ('\u0400', '\u04ff'),  # Cyrillic
    # NOTE(review): the Unicode Mathematical Operators block starts at
    # U+2200; the original lower bound of U+2220 is kept to preserve
    # behavior -- confirm whether U+2200..U+221F should be included.
    ('\u2220', '\u22ff'),  # Mathematical symbols
    ('\u3040', '\u309f'),  # Japanese hiragana
    ('\u30a0', '\u30ff'),  # Japanese katakana
)


def is_symbol(char):
    """Return True if ``char`` falls in one of the filtered symbol ranges.

    The ranges are listed in ``_SYMBOL_RANGES``. The check is a plain string
    comparison against each inclusive range, so the empty string and ordinary
    ASCII / CJK ideographs yield False.

    Args:
        char: The character to classify.

    Returns:
        True when ``char`` lies within any range in ``_SYMBOL_RANGES``,
        otherwise False.
    """
    return any(first <= char <= last for first, last in _SYMBOL_RANGES)
# Translation table that deletes every ASCII punctuation character; built once
# at import time instead of on every clear_symbol() call.
_ASCII_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clear_symbol(text):
    """Strip symbols and punctuation from ``text``.

    Each character is first normalized with ``conv_wide`` (full-width to
    half-width), then dropped if ``is_symbol`` flags it. ASCII punctuation is
    removed from what remains, and leading/trailing whitespace is trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if nothing survives the filtering.
    """
    # Normalize width first so full-width punctuation (e.g. U+FF01) becomes
    # ASCII and is caught by the punctuation table below.
    narrowed = map(conv_wide, text)
    # join() builds the result in one pass instead of repeated concatenation.
    kept = ''.join(char for char in narrowed if not is_symbol(char))
    return kept.translate(_ASCII_PUNCTUATION_TABLE).strip()
if __name__ == "__main__":
    # Demo: clean a sample mixing emoticon symbols, CJK text, full-width and
    # ASCII punctuation, Cyrillic and Latin-1 characters, then print it.
    sample = "✿✿ヽ(°▽°)ノ✿中文测试@!@#%!!_(:з」∠)_ð"
    cleaned = clear_symbol(sample)
    print(cleaned)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment