Created
May 23, 2016 07:26
-
-
Save JokerQyou/fc609f90e40ae1ca9b884e29da921aef to your computer and use it in GitHub Desktop.
Regular expression to split by punctuation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
from __future__ import print_function
import re
# Regular expression source matching one or more consecutive separator
# characters (punctuation, symbols and whitespace). Pass it to re.split() to
# cut text into fragments while keeping "@" attached to the word after it.
#
# Plain (non-raw) u'' literals are used on purpose: Python expands the \uXXXX
# escapes itself, so the character class holds the literal characters and the
# pattern also works on Python 2, whose re module has no \uXXXX escape.
pattern = u''.join([
    u'[',
    u'\u0009-\u000d',  # Tab, LF, VT, FF, CR: line breaks separate like the space does
    u'\u0020-\u002f',  # < Basic Latin punctuation and space; excludes @, letters and numbers
    u'\u003a-\u003f',
    u'\u005b-\u0060',
    u'\u007b-\u007f',  # > See http://jrgraphix.net/r/Unicode/0020-007F for details
    u'\u2000-\u206f',  # General Punctuation
    u'\u2e00-\u2e7f',  # Supplemental Punctuation
    u'\u3000-\u303f',  # CJK Symbols and Punctuation
    # Halfwidth and Fullwidth Forms, split the same way as the Basic Latin
    # ranges above so that fullwidth digits (ff10-ff19), fullwidth @ (ff20),
    # fullwidth letters (ff21-ff3a, ff41-ff5a) and halfwidth kana/hangul
    # (ff66-ffdc) stay part of the text instead of acting as separators.
    u'\uff00-\uff0f',
    u'\uff1a-\uff1f',
    u'\uff3b-\uff40',
    u'\uff5b-\uff65',  # Also covers the halfwidth CJK punctuation ff61-ff65
    u'\uffe0-\uffef',  # Fullwidth and halfwidth symbols
    u'\ufff0-\uffff',  # Specials
    u']+',
])
# The above form explains how the pattern works, but you can use this equivalent too
# u'[\t-\r -/:-?[-`{-\x7f\u2000-\u206f\u2e00-\u2e7f\u3000-\u303f\uff00-\uff0f\uff1a-\uff1f\uff3b-\uff40\uff5b-\uff65\uffe0-\uffef\ufff0-\uffff]+'
# Sample text mixing @mentions, CJK and ASCII punctuation, a space and line breaks.
a = u'''@测试员1,关注一下这个。@测试员2?@测试员3!@测试员4,@tester5:你好 @其他人
@更多人,换行带了空格
@还有谁?换行不带空格'''
# Print every fragment produced by the split. A plain loop is used because
# print() is called only for its side effect; a list comprehension would
# build and discard a list of None values.
for fragment in re.split(pattern, a):
    print(fragment)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment