Created
January 23, 2014 15:05
-
-
Save treystout/8580000 to your computer and use it in GitHub Desktop.
Revamped the algo to only go through haystack a single time. All segment detection is then done using a single word-location index. It sort of inch-worms down the haystack, marking valid "contractions" then returns the shortest one found.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""phrase_finder.py | |
Find the shortest segment of haystack text containing ALL of the needle words | |
SOME LICENSE BLAH BLAH... | |
Author: Trey Stout <treystout@gmail.com> | |
""" | |
import collections | |
import re | |
import sys | |
def shortest_segment(needles, haystack, debug=False): | |
"""returns the shortest string possible found in `haystack` that contains | |
every word found in the iterable `words` | |
raises `RuntimeError` if one of the words isn't in haystack | |
""" | |
# first off, sanitize the haystack | |
haystack = re.sub('\s+', ' ', haystack.lower().strip()) | |
haystack = re.sub('[^\w]', ' ', haystack) | |
haystack_words = haystack.split() | |
# to stay quick we can gather the info we need in a single pass over the | |
# haystack and keeping an index of the locations of needles we care about | |
idx = collections.defaultdict(list) | |
for i, word in enumerate(haystack_words): | |
if word in needles: # words in between don't matter | |
idx[word].append(i) | |
# exit early if one of our needle words is absent from the input text | |
for word in needles: | |
if word not in idx: | |
raise RuntimeError, "'s' was not found in the input text" % word | |
def check_bounds(head, tail): | |
"""return true if start and end contain all keywords | |
""" | |
needles_matched = 0 | |
for needle in needles: | |
# we're just ensuring that this needle has at least 1 location between our | |
# current head and tail | |
if any([tail <= i <= head for i in idx[needle]]): | |
needles_matched += 1 | |
return needles_matched == len(needles) | |
# now we have a scrubbed group of haystack words, and we know the word offset | |
# of every needle (stored in idx), we're going to attempt to expand our | |
# segment one at a time til the segment contains all needles, when that | |
# happens we will then advance our tail til the segment no longer contains all | |
# the needles. The segment right before the last tail move will be a "valid" | |
# segment, so we store the segment info for comparison at the end. | |
head = 0 | |
tail = 0 | |
segments = [] | |
while head < len(haystack_words): | |
head += 1 | |
if check_bounds(head, tail): | |
if debug: | |
print "head is now %d: segment is valid" % head | |
#this segment is now big enough to contail all the needles, attempt to | |
# shrink it now by advancing the tail til this check_bounds fails | |
while tail < head: | |
tail += 1 | |
if not check_bounds(head, tail): | |
if debug: | |
print "tail is now %d: segment just went invalid" % tail | |
segments.append({ | |
'head': head, | |
'tail': tail - 1, | |
'len': head - tail - 1 | |
}) | |
if debug: | |
print "segment marked: %s" % segments[-1] | |
break | |
# now we rank the segments we found by their overall length | |
ordered_segments = sorted(segments, key=lambda s: s['len']) | |
winner = ordered_segments[0] | |
return ' '.join(haystack_words[winner['tail']:winner['head']+1]) | |
if __name__ == "__main__": | |
try: | |
multiple = max(int(sys.argv[1]), 1) | |
except: | |
multiple = 1 | |
words = ["landmark", "city", "bridge"] | |
haystack = """The George Washington Bridge in New York City is | |
one of the oldest bridges ever constructed. It is now being | |
remodeled because the bridge is a landmark. City officials say | |
that the landmark bridge effort will create a lot of new jobs in | |
the city.""" * multiple | |
print shortest_segment(words, haystack, debug=False) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
1119 10:03:38 trey@lucky [~/code/jagerphraser/8564705] J:0 ⎆ time python phrase_finder.py 40
bridge is a landmark city
real 0m0.176s
user 0m0.164s
sys 0m0.010s