Skip to content

Instantly share code, notes, and snippets.

@treystout
Created January 23, 2014 15:05
Show Gist options
  • Save treystout/8580000 to your computer and use it in GitHub Desktop.
Save treystout/8580000 to your computer and use it in GitHub Desktop.
Revamped the algo to only go through haystack a single time. All segment detection is then done using a single word-location index. It sort of inch-worms down the haystack, marking valid "contractions" then returns the shortest one found.
#!/usr/bin/env python
"""phrase_finder.py
Find the shortest segment of haystack text containing ALL of the needle words
SOME LICENSE BLAH BLAH...
Author: Trey Stout <treystout@gmail.com>
"""
import collections
import re
import sys
def shortest_segment(needles, haystack, debug=False):
"""returns the shortest string possible found in `haystack` that contains
every word found in the iterable `words`
raises `RuntimeError` if one of the words isn't in haystack
"""
# first off, sanitize the haystack
haystack = re.sub('\s+', ' ', haystack.lower().strip())
haystack = re.sub('[^\w]', ' ', haystack)
haystack_words = haystack.split()
# to stay quick we can gather the info we need in a single pass over the
# haystack and keeping an index of the locations of needles we care about
idx = collections.defaultdict(list)
for i, word in enumerate(haystack_words):
if word in needles: # words in between don't matter
idx[word].append(i)
# exit early if one of our needle words is absent from the input text
for word in needles:
if word not in idx:
raise RuntimeError, "'s' was not found in the input text" % word
def check_bounds(head, tail):
"""return true if start and end contain all keywords
"""
needles_matched = 0
for needle in needles:
# we're just ensuring that this needle has at least 1 location between our
# current head and tail
if any([tail <= i <= head for i in idx[needle]]):
needles_matched += 1
return needles_matched == len(needles)
# now we have a scrubbed group of haystack words, and we know the word offset
# of every needle (stored in idx), we're going to attempt to expand our
# segment one at a time til the segment contains all needles, when that
# happens we will then advance our tail til the segment no longer contains all
# the needles. The segment right before the last tail move will be a "valid"
# segment, so we store the segment info for comparison at the end.
head = 0
tail = 0
segments = []
while head < len(haystack_words):
head += 1
if check_bounds(head, tail):
if debug:
print "head is now %d: segment is valid" % head
#this segment is now big enough to contail all the needles, attempt to
# shrink it now by advancing the tail til this check_bounds fails
while tail < head:
tail += 1
if not check_bounds(head, tail):
if debug:
print "tail is now %d: segment just went invalid" % tail
segments.append({
'head': head,
'tail': tail - 1,
'len': head - tail - 1
})
if debug:
print "segment marked: %s" % segments[-1]
break
# now we rank the segments we found by their overall length
ordered_segments = sorted(segments, key=lambda s: s['len'])
winner = ordered_segments[0]
return ' '.join(haystack_words[winner['tail']:winner['head']+1])
if __name__ == "__main__":
try:
multiple = max(int(sys.argv[1]), 1)
except:
multiple = 1
words = ["landmark", "city", "bridge"]
haystack = """The George Washington Bridge in New York City is
one of the oldest bridges ever constructed. It is now being
remodeled because the bridge is a landmark. City officials say
that the landmark bridge effort will create a lot of new jobs in
the city.""" * multiple
print shortest_segment(words, haystack, debug=False)
@treystout
Copy link
Author

1119 10:03:38 trey@lucky [~/code/jagerphraser/8564705] J:0 ⎆ time python phrase_finder.py 40
bridge is a landmark city

real 0m0.176s
user 0m0.164s
sys 0m0.010s

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment