Skip to content

Instantly share code, notes, and snippets.

@ericflo
Created November 20, 2010 23:18
Show Gist options
  • Save ericflo/708262 to your computer and use it in GitHub Desktop.
Save ericflo/708262 to your computer and use it in GitHub Desktop.
An iterator that works like re.split, except that it takes a list of regexes and tells you which regex produced each piece of the split (or reports -1 for text that did not match any regex).
# Extracted from django-oembed, because I keep having a need for this snippet
import re
from heapq import heappush, heappop
def re_parts(regex_list, text):
    """
    Split ``text`` into consecutive pieces, tagging each piece with the
    regex that matched it.

    The pieces are yielded in order of position. Matches are taken from
    left to right; a match that starts inside text already covered by an
    earlier piece is dropped, so no character is reported twice. When two
    regexes match at the same position, the one that appears first in
    ``regex_list`` wins.

    Args:
        regex_list: a sequence of compiled regular expressions.
        text: the string to split.

    Yields:
        ``(index, piece)`` tuples, where ``index`` is the position of the
        matching regex in ``regex_list``, or -1 for text that was not
        claimed by any regex.

    >>> first_re = re.compile('asdf')
    >>> second_re = re.compile('an')
    >>> list(re_parts([first_re, second_re], 'This is an asdf test.'))
    [(-1, 'This is '), (1, 'an'), (-1, ' '), (0, 'asdf'), (-1, ' test.')]
    >>> list(re_parts([first_re, second_re], 'asdfasdfasdf'))
    [(0, 'asdf'), (0, 'asdf'), (0, 'asdf')]
    >>> list(re_parts([], 'This is an asdf test.'))
    [(-1, 'This is an asdf test.')]
    >>> third_re = re.compile('sdf')
    >>> list(re_parts([first_re, second_re, third_re], 'This is an asdf test.'))
    [(-1, 'This is '), (1, 'an'), (-1, ' '), (0, 'asdf'), (-1, ' test.')]
    """
    # Map every distinct regex to its first position in regex_list. This is
    # the value regex_list.index() would return, computed once instead of
    # once per match.
    first_index = {}
    for index, regex in enumerate(regex_list):
        first_index.setdefault(regex, index)

    # Heap of (start, regex index, match, iterator). The regex index is a
    # unique tie-breaker for matches that start at the same offset, so the
    # comparison never reaches the (unorderable) match objects.
    matches = []

    # Bootstrap the search with the first hit of each regex; regexes that
    # never match simply contribute nothing to the heap.
    for regex, index in first_index.items():
        iterator = regex.finditer(text)
        match = next(iterator, None)
        if match is not None:
            heappush(matches, (match.start(), index, match, iterator))

    prev_end = 0
    while matches:
        # Take the earliest pending match.
        start, index, match, iterator = heappop(matches)

        # Queue the next hit from the same regex, whether or not the current
        # one ends up being emitted.
        newmatch = next(iterator, None)
        if newmatch is not None:
            heappush(matches, (newmatch.start(), index, newmatch, iterator))

        if start < prev_end:
            # Overlaps text that has already been yielded; skip it so the
            # output never repeats part of the input.
            continue

        if start > prev_end:
            # Unmatched text between the previous piece and this match.
            yield (-1, text[prev_end:start])

        end = match.end()
        yield (index, text[start:end])
        prev_end = end

    # Whatever is left after the last match belongs to no regex.
    last_bit = text[prev_end:]
    if last_bit:
        yield (-1, last_bit)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment