Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Regular expression to split by punctuation
# coding: utf-8
from __future__ import print_function
import re
# Delimiter character class: a run of one or more characters taken from the
# code-point ranges below is treated as a single split point.  Each pair is
# the (first, last) code point of one inclusive range.
pattern = u'[%s]+' % u''.join(
    first + u'-' + last
    for first, last in (
        # Basic Latin without '@', digits and letters; see the chart at
        # http://jrgraphix.net/r/Unicode/0020-007F
        (u'\u0020', u'\u002f'),  # space through '/'
        (u'\u003A', u'\u003f'),  # ':' through '?' (stops just before '@')
        (u'\u005b', u'\u0060'),  # '[' through '`'
        (u'\u007b', u'\u007f'),  # '{' through DEL
        (u'\u2000', u'\u206f'),  # General Punctuation
        (u'\u2e00', u'\u2e7f'),  # Supplemental Punctuation
        (u'\u3000', u'\u303f'),  # CJK Symbols and Punctuation
        (u'\uff00', u'\uffef'),  # Halfwidth and Fullwidth Forms
        (u'\ufff0', u'\uffff'),  # Specials
    )
)
# The form above explains how the pattern works; this equivalent single literal can be used instead:
# u'[ -/:-?[-`{-\x7f\u2000-\u206f\u2e00-\u2e7f\u3000-\u303f\uff00-\uffef\ufff0-\uffff]+'
# Demo input: @-mentions separated by punctuation, spaces and line breaks
# (the sample's own text notes which line breaks are meant to carry a space).
a = u'''@测试员1,关注一下这个。@测试员2?@测试员3!@测试员4,@tester5:你好 @其他人
@更多人,换行带了空格
@还有谁?换行不带空格'''
# Print every fragment produced by the split on its own line.  A plain loop
# is used because print() is called purely for its side effect, so there is
# no result list worth building.
for piece in re.split(pattern, a):
    print(piece)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment