Created
November 20, 2010 23:18
-
-
Save ericflo/708262 to your computer and use it in GitHub Desktop.
An iterator which operates like re.split, except it takes a list of regexes, and lets you know which regex is responsible for each split (or whether that part of the split was the result of not matching any regex)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extracted from django-oembed, because I keep having a need for this snippet
import re | |
from heapq import heappush, heappop | |
def re_parts(regex_list, text): | |
""" | |
An iterator that returns the entire text, but split by which regex it | |
matched, or none at all. If it did, the first value of the returned tuple | |
is the index into the regex list, otherwise -1. | |
>>> first_re = re.compile('asdf') | |
>>> second_re = re.compile('an') | |
>>> list(re_parts([first_re, second_re], 'This is an asdf test.')) | |
[(-1, 'This is '), (1, 'an'), (-1, ' '), (0, 'asdf'), (-1, ' test.')] | |
>>> list(re_parts([first_re, second_re], 'asdfasdfasdf')) | |
[(0, 'asdf'), (0, 'asdf'), (0, 'asdf')] | |
>>> list(re_parts([], 'This is an asdf test.')) | |
[(-1, 'This is an asdf test.')] | |
>>> third_re = re.compile('sdf') | |
>>> list(re_parts([first_re, second_re, third_re], 'This is an asdf test.')) | |
[(-1, 'This is '), (1, 'an'), (-1, ' '), (0, 'asdf'), (-1, ' test.')] | |
""" | |
def match_compare(x, y): | |
return x.start() - y.start() | |
prev_end = 0 | |
iter_dict = dict((r, r.finditer(text)) for r in regex_list) | |
# a heapq containing matches | |
matches = [] | |
# bootstrap the search with the first hit for each iterator | |
for regex, iterator in iter_dict.items(): | |
try: | |
match = iterator.next() | |
heappush(matches, (match.start(), match)) | |
except StopIteration: | |
iter_dict.pop(regex) | |
# process matches, revisiting each iterator from which a match is used | |
while matches: | |
# get the earliest match | |
start, match = heappop(matches) | |
end = match.end() | |
if start > prev_end: | |
# yield the text from current location to start of match | |
yield (-1, text[prev_end:start]) | |
# yield the match | |
yield (regex_list.index(match.re), text[start:end]) | |
# get the next match from the iterator for this match | |
if match.re in iter_dict: | |
try: | |
newmatch = iter_dict[match.re].next() | |
heappush(matches, (newmatch.start(), newmatch)) | |
except StopIteration: | |
iter_dict.pop(match.re) | |
prev_end = end | |
# yield text from end of last match to end of text | |
last_bit = text[prev_end:] | |
if len(last_bit) > 0: | |
yield (-1, last_bit) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment