Created
January 22, 2014 18:43
-
-
Save treystout/8564705 to your computer and use it in GitHub Desktop.
A shot at finding the smallest substring of a larger string that contains all of the words in the needle phrase
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""phrase_finder.py | |
Find the shortest segment of haystack text containing ALL of the needle words | |
SOME LICENSE BLAH BLAH... | |
Author: Trey Stout <treystout@gmail.com> | |
""" | |
import itertools | |
import re | |
import sys | |
def shortest_segment(words, haystack, debug=False):
    """Return the shortest substring of `haystack` containing every word
    in the iterable `words`.

    Matching is case-insensitive: the haystack is lowercased (and runs of
    whitespace collapsed to single spaces) and each needle word is
    lowercased before searching.  Each needle must appear as a whole word
    (bounded by ``\\b``).  The returned segment is drawn from the
    normalized (lowercased, whitespace-collapsed) haystack.

    Raises `RuntimeError` if any word in `words` is not found.
    """
    NEEDLE_TMPL = r'\b%s\b'  # regex template enforcing word boundaries
    if not words:
        return ""  # no needles -> the empty segment trivially covers them
    # sanitize our haystack a bit: lowercase, trim, collapse whitespace
    haystack = re.sub(r'\s+', ' ', haystack.lower().strip())
    # lowercase the needles (the haystack was lowercased) and escape them so
    # regex metacharacters in a word cannot corrupt the pattern
    lowered = [w.lower() for w in words]
    # exit early if one of our needle words is absent from the input text
    for word in lowered:
        if not re.search(NEEDLE_TMPL % re.escape(word), haystack):
            raise RuntimeError("'%s' was not found in the input text" % word)
    # one OR-of-all-needles pattern; index every match, grouped by word.
    # m.group() is the literal matched text, which equals the lowered word.
    needle = "|".join(NEEDLE_TMPL % re.escape(w) for w in lowered)
    all_matches = {w: [] for w in lowered}
    for m in re.finditer(needle, haystack):
        all_matches[m.group()].append(m)
    # try every combination that picks one occurrence of each word; the
    # tightest span over a combination is a candidate segment
    best = None
    for combo in itertools.product(*all_matches.values()):
        if debug:
            print("checking combination: %s"
                  % ["%d:%s" % (m.start(), m.group()) for m in combo])
        start = min(m.start() for m in combo)
        end = max(m.end() for m in combo)  # match.end() is already exclusive
        segment = haystack[start:end]
        if best is None or len(segment) < len(best):
            best = segment
    if debug:
        print("shortest segment found: %r" % best)
    return best
if __name__ == "__main__":
    # Demo: find the tightest span of the haystack covering all three
    # needle words.  (Python 3 print-call syntax; also valid in Python 2
    # with a single argument.)
    words = ["landmark", "city", "bridge"]
    haystack = """The George Washington Bridge in New York City is
    one of the oldest bridges ever constructed. It is now being
    remodeled because the bridge is a landmark. City officials say
    that the landmark bridge effort will create a lot of new jobs in
    the city."""
    print(shortest_segment(words, haystack, debug=False))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Some potential issues: a limit should probably be set on the number of words allowed in the needle list. Each needle word is checked individually against the haystack to ensure it exists, so very large needles, very large haystacks, or a large number of needle words could exhaust resources quickly.
It does handle dupes fairly well. It also keeps an index of all matches with their locations, so if you wanted to return the offset as well as the substring, that would be a minor change.
If the itertools.product() call is considered cheating, it wouldn't be hard to implement that.