Created
November 6, 2011 19:58
-
-
Save agriffis/1343386 to your computer and use it in GitHub Desktop.
isplit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def isplit(patt, s, flags=None):
    """Lazily split *s* on the regex *patt*, yielding content and separators.

    Behaves like ``re.split`` with these differences:

    1. It is a generator, not a list.
    2. Zero-width separators work properly
       (see http://bugs.python.org/issue3262).
    3. The separators are always part of the output, as if calling
       ``re.split(r'(patt)', s)``.

    The generated items alternate content, separator, content, ... and
    always start and end with content (possibly the empty string), so an
    odd number of items is produced.

    Args:
        patt: Regular expression (pattern string or compiled pattern) that
            matches the separators.
        s: The string to split.
        flags: Optional ``re`` flags passed to ``re.finditer``; ``None``
            means no flags.

    Yields:
        Slices of *s*: content pieces at even positions, separators at odd
        positions.

    >>> list(isplit(r'\\s+', 'a b  c'))
    ['a', ' ', 'b', '  ', 'c']
    """
    # Imported here so the function is usable even when the enclosing
    # module does not import ``re`` itself.
    import re

    matches = re.finditer(patt, s, 0 if flags is None else flags)

    # End offset of the most recent separator (0 == start-of-string).
    prev_end = 0
    nextm = next(matches, None)

    while nextm is not None:
        # Always keep one match of look-ahead in ``nextm``.
        m, nextm = nextm, next(matches, None)

        # There are two zero-width separator special cases to handle:
        #
        # 1. A zero-width separator immediately following another separator
        #    (or the start-of-string), for example matching \b immediately
        #    after matching \s+.  Such a separator is skipped.
        #
        # 2. A zero-width separator matching immediately prior to another
        #    separator, for example matching \b immediately prior to
        #    matching \s+ with the pattern r'\b|\s+'.  The longer separator
        #    is preferred.  This relies on finditer reporting a non-empty
        #    match that starts where an empty match was just found, which
        #    Python 3.7+ does.  On older interpreters the look-ahead loop
        #    below never fires and callers must order their alternatives
        #    as r'\s+|\b' rather than r'\b|\s+'.
        if m.start() == m.end() == prev_end:
            # Case 1: nothing lies between the previous separator and this
            # empty one.  ``prev_end`` already equals ``m.end()``.
            continue

        while nextm is not None and m.start() == m.end() == nextm.start():
            # Case 2: replace the empty separator by the one that starts
            # at the same position.
            m, nextm = nextm, next(matches, None)

        # Yield the content prior to this separator.
        yield s[prev_end:m.start()]

        if m.start() == len(s):
            # Don't yield the end-of-string as a zero-length separator.
            # We're done, and the output already ends with content.
            return

        # Yield this separator.
        yield s[m.start():m.end()]
        prev_end = m.end()

    # There's always content following the last separator.
    yield s[prev_end:]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment