Skip to content

Instantly share code, notes, and snippets.

@xflr6
Last active June 4, 2022 13:09
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xflr6/c0e28a7a8ee4e79b571d24682805ba94 to your computer and use it in GitHub Desktop.
Save xflr6/c0e28a7a8ee4e79b571d24682805ba94 to your computer and use it in GitHub Desktop.
Split a string into chunks by a pattern matching at the start of each item
"""Split a string into chunks by a pattern matching at the start of each item.
>>> list(itersplit(r'!', 'spam !eggs !ham'))
['spam ', '!eggs ', '!ham']
>>> list(itersplit(r'X', 'spam !eggs !ham'))
['spam !eggs !ham']
>>> list(itersplit(r'!', '!spam !eggs !ham'))
['', '!spam ', '!eggs ', '!ham']
>>> list(itersplit(r'!', '!spam !eggs !ham', no_empty=True))
['!spam ', '!eggs ', '!ham']
>>> re.findall(r'(?s).+?(?=!|\Z)', 'spam !eggs !ham')
['spam ', '!eggs ', '!ham']
>>> re.findall(r'(?s).+?(?=X|\Z)', 'spam !eggs !ham')
['spam !eggs !ham']
>>> re.findall(r'(?s).+?(?=!|\Z)', '!spam !eggs !ham')
['!spam ', '!eggs ', '!ham']
"""
from collections.abc import Iterator
import itertools
import re
def itersplit_(start_pattern: str, string: str, *,
               flags: int = 0) -> Iterator[str]:
    """Return an iterator over chunks of ``string`` split before each match.

    Every position where ``start_pattern`` matches begins a new chunk, so
    the chunks concatenate back to ``string``. The text before the first
    match is always produced, which makes the first chunk the empty string
    when the pattern matches at position 0.

    Args:
        start_pattern: Regular expression marking the start of each item.
        string: Text to split.
        flags: ``re`` flags forwarded to ``re.finditer``.

    Returns:
        A lazy iterator over the chunks.

    >>> list(itersplit_(r'!', 'spam !eggs !ham'))
    ['spam ', '!eggs ', '!ham']
    >>> list(itersplit_(r'!', '!spam !eggs !ham'))
    ['', '!spam ', '!eggs ', '!ham']
    """
    match_starts = (ma.start()
                    for ma in re.finditer(start_pattern, string, flags))
    # Frame the match offsets with both ends of the string so that
    # consecutive pairs of boundaries delimit the chunks.
    boundaries = itertools.chain([0], match_starts, [len(string)])
    starts, ends = itertools.tee(boundaries)
    next(ends)  # shift the second copy by one to pair each start with its end
    # The builtin zip is lazy on Python 3; itertools.izip no longer exists.
    return (string[s:e] for s, e in zip(starts, ends))
def itersplit(start_pattern, string, *,
              no_empty: bool = False,
              flags: int = 0) -> Iterator[str]:
    """Yield chunks of ``string``, starting a new chunk at every match.

    The chunks concatenate back to ``string``. If the pattern does not
    match at all, ``string`` is yielded unchanged as the only chunk.

    Args:
        start_pattern: Regular expression marking the start of each item.
        string: Text to split.
        no_empty: If true, suppress the empty first chunk that is otherwise
            produced when the pattern matches at position 0.
        flags: ``re`` flags forwarded to ``re.finditer``.

    Yields:
        The successive chunks of ``string``.

    >>> list(itersplit(r'!', 'spam !eggs'))
    ['spam ', '!eggs']
    >>> list(itersplit(r'!', '!spam', no_empty=True))
    ['!spam']
    """
    matches = re.finditer(start_pattern, string, flags)
    first = next(matches, None)
    if first is None:
        yield string
        return

    pos = first.start()
    # Emit the text before the first match; it is empty when pos == 0 and
    # is then only wanted if the caller did not ask for no_empty.
    if pos or not no_empty:
        yield string[:pos]

    for ma in matches:
        end = ma.start()
        yield string[pos:end]
        pos = end

    # Use pos rather than the loop variable: with a single match the loop
    # body never runs and the loop-local end offset would be unbound.
    yield string[pos:]
if __name__ == '__main__':
    # When executed as a script, run the doctest examples of this module.
    from doctest import testmod
    testmod()
"""Split a string into chunks by a string matching at the start of each item."""
from collections.abc import Iterator
import functools
def finditer(sep: str, string: str, *,
             no_empty: bool = False) -> Iterator[str]:
    """Yield chunks of ``string``, starting a new chunk at every ``sep``.

    The chunks concatenate back to ``string``. If ``sep`` does not occur,
    ``string`` is yielded unchanged as the only chunk.

    Args:
        sep: Literal string marking the start of each item.
        string: Text to split.
        no_empty: If true, suppress the empty first chunk that is otherwise
            produced when ``string`` starts with ``sep``.

    Yields:
        The successive chunks of ``string``.

    >>> list(finditer('!', 'spam !eggs !ham'))
    ['spam ', '!eggs ', '!ham']
    >>> list(finditer('X', 'spam !eggs !ham'))
    ['spam !eggs !ham']
    >>> list(finditer('!', '!spam !eggs !ham'))
    ['', '!spam ', '!eggs ', '!ham']
    >>> list(finditer('!', '!spam !eggs !ham', no_empty=True))
    ['!spam ', '!eggs ', '!ham']
    >>> list(finditer('!', '!spam', no_empty=True))
    ['!spam']
    """
    s = len(sep)
    find = functools.partial(string.find, sep)
    i = find()
    if i == 0 and no_empty:
        # Skip the leading separator so the first chunk is not empty.
        i = find(s)
    # Test for "not found" only after the skip above: when the leading
    # separator is the only one, the second search returns -1 and the whole
    # string must be yielded instead of being sliced with a negative index.
    if i < 0:
        yield string
        return
    yield string[:i]
    n = find(i + s)
    while n > 0:
        yield string[i:n]
        i, n = n, find(n + s)
    yield string[i:]
def finditer_caret(sep: str, string: str, *,
                   no_empty: bool = False) -> Iterator[str]:
    r"""Yield chunks of ``string`` that begin where ``sep`` starts a line.

    A new chunk starts wherever ``sep`` directly follows a ``'\n'``; the
    newline itself stays at the end of the preceding chunk. If ``string``
    begins with ``sep``, an empty first chunk is yielded unless ``no_empty``
    is true.

    >>> list(finditer_caret('!', 'spam\n!eggs\n!ham'))
    ['spam\n', '!eggs\n', '!ham']
    >>> list(finditer_caret('X', 'spam\n!eggs\n!ham'))
    ['spam\n!eggs\n!ham']
    >>> list(finditer_caret('!', '!spam\n!eggs\n!ham'))
    ['', '!spam\n', '!eggs\n', '!ham']
    >>> list(finditer_caret('!', '!spam\n!eggs\n!ham', no_empty=True))
    ['!spam\n', '!eggs\n', '!ham']
    >>> list(finditer_caret('!', '\n!spam\n!eggs\n!ham'))
    ['\n', '!spam\n', '!eggs\n', '!ham']
    """
    marker = '\n' + sep
    step = len(marker)

    if string.startswith(sep):
        if not no_empty:
            yield ''
        # Resume the search behind the separator that opens the string.
        newline = string.find(marker, len(sep))
    else:
        newline = string.find(marker)

    chunk_start = 0
    while newline >= 0:
        # Cut right after the newline so it closes the current chunk.
        chunk_end = newline + 1
        yield string[chunk_start:chunk_end]
        chunk_start = chunk_end
        newline = string.find(marker, newline + step)
    yield string[chunk_start:]
def itersplit(sep: str, string: str, *,
              no_empty: bool = False) -> Iterator[str]:
    r"""Yield chunks of ``string``, starting a new chunk at every ``sep``.

    A leading ``'^'`` in ``sep`` restricts the separator to line starts:
    the rest of ``sep`` must then open ``string`` or directly follow a
    ``'\n'``, and that newline stays at the end of the preceding chunk.
    If ``string`` begins with the separator, an empty first chunk is
    yielded unless ``no_empty`` is true.

    >>> list(itersplit('!', 'spam !eggs !ham'))
    ['spam ', '!eggs ', '!ham']
    >>> list(itersplit('^!', 'spam\n!eggs\n!ham'))
    ['spam\n', '!eggs\n', '!ham']
    >>> list(itersplit('X', 'spam !eggs !ham'))
    ['spam !eggs !ham']
    >>> list(itersplit('^X', 'spam\n!eggs\n!ham'))
    ['spam\n!eggs\n!ham']
    >>> list(itersplit('!', '!spam !eggs !ham'))
    ['', '!spam ', '!eggs ', '!ham']
    >>> list(itersplit('^!', '!spam\n!eggs\n!ham'))
    ['', '!spam\n', '!eggs\n', '!ham']
    >>> list(itersplit('!', '!spam !eggs !ham', no_empty=True))
    ['!spam ', '!eggs ', '!ham']
    >>> list(itersplit('^!', '!spam\n!eggs\n!ham', no_empty=True))
    ['!spam\n', '!eggs\n', '!ham']
    """
    anchored = sep.startswith('^')
    token = sep[1:] if anchored else sep
    # In anchored mode the search target carries the preceding newline, so
    # every cut has to move one character past the position that was found.
    needle = '\n' + token if anchored else token
    shift = 1 if anchored else 0
    step = len(needle)

    if not anchored:
        hit = string.find(needle)
        if no_empty and hit == 0:
            hit = string.find(needle, step)
    elif string.startswith(token):
        if not no_empty:
            yield ''
        hit = string.find(needle, len(token))
    else:
        hit = string.find(needle)

    if hit < 0:
        yield string
        return

    yield string[:hit + shift]
    while True:
        following = string.find(needle, hit + step)
        if following <= 0:
            break
        yield string[hit + shift:following + shift]
        hit = following
    yield string[hit + shift:]
if __name__ == '__main__':
    # When executed as a script, run the doctest examples of this module.
    from doctest import testmod
    testmod()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment