# ericflo/re_parts.py

## re_parts.py
# Extracted from django-oembed, because I keep having a need for this snippet

import re
from heapq import heappush, heappop

def re_parts(regex_list, text):
    """
    Split ``text`` into consecutive pieces, tagged by the regex that matched.

    This is a generator yielding ``(index, piece)`` tuples in text order.
    ``index`` is the position in ``regex_list`` of the compiled pattern that
    matched ``piece``, or -1 for text that no pattern matched.  Joining every
    ``piece`` reproduces ``text``.

    When matches overlap, the one that starts first wins; if two matches start
    at the same position, the pattern listed earlier in ``regex_list`` wins.
    A match that starts inside text that has already been yielded is dropped,
    and that pattern's scan simply continues with its next match.

    ``regex_list`` must contain compiled patterns (objects with ``finditer``).
    A pattern listed more than once is scanned once and reported under its
    first index.

    >>> first_re = re.compile('asdf')
    >>> second_re = re.compile('an')
    >>> list(re_parts([first_re, second_re], 'This is an asdf test.'))
    [(-1, 'This is '), (1, 'an'), (-1, ' '), (0, 'asdf'), (-1, ' test.')]

    >>> list(re_parts([first_re, second_re], 'asdfasdfasdf'))
    [(0, 'asdf'), (0, 'asdf'), (0, 'asdf')]

    >>> list(re_parts([], 'This is an asdf test.'))
    [(-1, 'This is an asdf test.')]

    >>> third_re = re.compile('sdf')
    >>> list(re_parts([first_re, second_re, third_re], 'This is an asdf test.'))
    [(-1, 'This is '), (1, 'an'), (-1, ' '), (0, 'asdf'), (-1, ' test.')]
    """
    # Map every distinct pattern to the first position it occupies in
    # regex_list; this is the index reported for its matches.
    first_index = {}
    for index, regex in enumerate(regex_list):
        first_index.setdefault(regex, index)

    # Heap of (start, index, match, iterator) with at most one pending entry
    # per pattern.  The indexes are therefore unique, so tuple comparison is
    # always decided by (start, index) and never reaches the match object,
    # which does not support ordering.
    pending = []

    def push_next(index, iterator):
        # Queue the iterator's next match, if it has one left.
        match = next(iterator, None)
        if match is not None:
            heappush(pending, (match.start(), index, match, iterator))

    # bootstrap the search with the first hit for each pattern
    for regex, index in first_index.items():
        push_next(index, regex.finditer(text))

    prev_end = 0
    while pending:
        # get the earliest match and immediately queue its successor
        start, index, match, iterator = heappop(pending)
        push_next(index, iterator)
        if start < prev_end:
            # overlaps text that was already yielded; emitting it would
            # duplicate part of the input
            continue
        if start > prev_end:
            # yield the unmatched text between the previous piece and this one
            yield (-1, text[prev_end:start])
        end = match.end()
        yield (index, text[start:end])
        prev_end = end

    # yield text from end of last match to end of text
    last_bit = text[prev_end:]
    if last_bit:
        yield (-1, last_bit)
	# Extracted from django-oembed, because I keep having a need for this snippet

	import re
	from heapq import heappush, heappop

	def re_parts(regex_list, text):
	    """
	    Split ``text`` into consecutive pieces, tagged by the regex that matched.

	    This is a generator yielding ``(index, piece)`` tuples in text order.
	    ``index`` is the position in ``regex_list`` of the compiled pattern that
	    matched ``piece``, or -1 for text that no pattern matched.  Joining every
	    ``piece`` reproduces ``text``.

	    When matches overlap, the one that starts first wins; if two matches start
	    at the same position, the pattern listed earlier in ``regex_list`` wins.
	    A match that starts inside text that has already been yielded is dropped,
	    and that pattern's scan simply continues with its next match.

	    ``regex_list`` must contain compiled patterns (objects with ``finditer``).
	    A pattern listed more than once is scanned once and reported under its
	    first index.

	    >>> first_re = re.compile('asdf')
	    >>> second_re = re.compile('an')
	    >>> list(re_parts([first_re, second_re], 'This is an asdf test.'))
	    [(-1, 'This is '), (1, 'an'), (-1, ' '), (0, 'asdf'), (-1, ' test.')]

	    >>> list(re_parts([first_re, second_re], 'asdfasdfasdf'))
	    [(0, 'asdf'), (0, 'asdf'), (0, 'asdf')]

	    >>> list(re_parts([], 'This is an asdf test.'))
	    [(-1, 'This is an asdf test.')]

	    >>> third_re = re.compile('sdf')
	    >>> list(re_parts([first_re, second_re, third_re], 'This is an asdf test.'))
	    [(-1, 'This is '), (1, 'an'), (-1, ' '), (0, 'asdf'), (-1, ' test.')]
	    """
	    # Map every distinct pattern to the first position it occupies in
	    # regex_list; this is the index reported for its matches.
	    first_index = {}
	    for index, regex in enumerate(regex_list):
	        first_index.setdefault(regex, index)

	    # Heap of (start, index, match, iterator) with at most one pending entry
	    # per pattern.  The indexes are therefore unique, so tuple comparison is
	    # always decided by (start, index) and never reaches the match object,
	    # which does not support ordering.
	    pending = []

	    def push_next(index, iterator):
	        # Queue the iterator's next match, if it has one left.
	        match = next(iterator, None)
	        if match is not None:
	            heappush(pending, (match.start(), index, match, iterator))

	    # bootstrap the search with the first hit for each pattern
	    for regex, index in first_index.items():
	        push_next(index, regex.finditer(text))

	    prev_end = 0
	    while pending:
	        # get the earliest match and immediately queue its successor
	        start, index, match, iterator = heappop(pending)
	        push_next(index, iterator)
	        if start < prev_end:
	            # overlaps text that was already yielded; emitting it would
	            # duplicate part of the input
	            continue
	        if start > prev_end:
	            # yield the unmatched text between the previous piece and this one
	            yield (-1, text[prev_end:start])
	        end = match.end()
	        yield (index, text[start:end])
	        prev_end = end

	    # yield text from end of last match to end of text
	    last_bit = text[prev_end:]
	    if last_bit:
	        yield (-1, last_bit)