treystout/phrase_finder.py

## phrase_finder.py
#!/usr/bin/env python
"""phrase_finder.py

Find the shortest segment of haystack text containing ALL of the needle words

SOME LICENSE BLAH BLAH...

Author: Trey Stout <treystout@gmail.com>
"""
import collections
import re
import sys

def shortest_segment(needles, haystack, debug=False):
  """returns the shortest string possible found in `haystack` that contains
  every word found in the iterable `words`

  raises `RuntimeError` if one of the words isn't in haystack
  """
  # first off, sanitize the haystack
  haystack = re.sub('\s+', ' ', haystack.lower().strip())
  haystack = re.sub('[^\w]', ' ', haystack)
  haystack_words = haystack.split()

  # to stay quick we can gather the info we need in a single pass over the
  # haystack and keeping an index of the locations of needles we care about
  idx = collections.defaultdict(list)
  for i, word in enumerate(haystack_words):
    if word in needles: # words in between don't matter
      idx[word].append(i)

  # exit early if one of our needle words is absent from the input text
  for word in needles:
    if word not in idx:
      raise RuntimeError, "'s' was not found in the input text" % word

  def check_bounds(head, tail):
    """return true if start and end contain all keywords
    """
    needles_matched = 0
    for needle in needles:
      # we're just ensuring that this needle has at least 1 location between our
      # current head and tail
      if any([tail <= i <= head for i in idx[needle]]):
        needles_matched += 1
    return needles_matched == len(needles)

  # now we have a scrubbed group of haystack words, and we know the word offset
  # of every needle (stored in idx), we're going to attempt to expand our
  # segment one at a time til the segment contains all needles, when that
  # happens we will then advance our tail til the segment no longer contains all
  # the needles. The segment right before the last tail move will be a "valid"
  # segment, so we store the segment info for comparison at the end.
  head = 0
  tail = 0
  segments = []
  while head < len(haystack_words):
    head += 1
    if check_bounds(head, tail):
      if debug:
        print "head is now %d: segment is valid" % head
      #this segment is now big enough to contail all the needles, attempt to
      # shrink it now by advancing the tail til this check_bounds fails
      while tail < head:
        tail += 1
        if not check_bounds(head, tail):
          if debug:
            print "tail is now %d: segment just went invalid" % tail
          segments.append({
            'head': head,
            'tail': tail - 1,
            'len': head - tail - 1
          })
          if debug:
            print "segment marked: %s" % segments[-1]
          break

  # now we rank the segments we found by their overall length
  ordered_segments = sorted(segments, key=lambda s: s['len'])
  winner = ordered_segments[0]
  return ' '.join(haystack_words[winner['tail']:winner['head']+1])


if __name__ == "__main__":
  try:
    multiple = max(int(sys.argv[1]), 1)
  except:
    multiple = 1

  words = ["landmark", "city", "bridge"]
  haystack = """The George Washington Bridge in New York City is
  one of the oldest bridges ever constructed. It is now being
  remodeled because the bridge is a landmark. City officials say
  that the landmark bridge effort will create a lot of new jobs in
  the city.""" * multiple
  print shortest_segment(words, haystack, debug=False)
	#!/usr/bin/env python
	"""phrase_finder.py

	Find the shortest segment of haystack text containing ALL of the needle words

	SOME LICENSE BLAH BLAH...

	Author: Trey Stout <treystout@gmail.com>
	"""
	import collections
	import re
	import sys

	def shortest_segment(needles, haystack, debug=False):
	"""returns the shortest string possible found in `haystack` that contains
	every word found in the iterable `words`

	raises `RuntimeError` if one of the words isn't in haystack
	"""
	# first off, sanitize the haystack
	haystack = re.sub('\s+', ' ', haystack.lower().strip())
	haystack = re.sub('[^\w]', ' ', haystack)
	haystack_words = haystack.split()

	# to stay quick we can gather the info we need in a single pass over the
	# haystack and keeping an index of the locations of needles we care about
	idx = collections.defaultdict(list)
	for i, word in enumerate(haystack_words):
	if word in needles: # words in between don't matter
	idx[word].append(i)

	# exit early if one of our needle words is absent from the input text
	for word in needles:
	if word not in idx:
	raise RuntimeError, "'s' was not found in the input text" % word

	def check_bounds(head, tail):
	"""return true if start and end contain all keywords
	"""
	needles_matched = 0
	for needle in needles:
	# we're just ensuring that this needle has at least 1 location between our
	# current head and tail
	if any([tail <= i <= head for i in idx[needle]]):
	needles_matched += 1
	return needles_matched == len(needles)

	# now we have a scrubbed group of haystack words, and we know the word offset
	# of every needle (stored in idx), we're going to attempt to expand our
	# segment one at a time til the segment contains all needles, when that
	# happens we will then advance our tail til the segment no longer contains all
	# the needles. The segment right before the last tail move will be a "valid"
	# segment, so we store the segment info for comparison at the end.
	head = 0
	tail = 0
	segments = []
	while head < len(haystack_words):
	head += 1
	if check_bounds(head, tail):
	if debug:
	print "head is now %d: segment is valid" % head
	#this segment is now big enough to contail all the needles, attempt to
	# shrink it now by advancing the tail til this check_bounds fails
	while tail < head:
	tail += 1
	if not check_bounds(head, tail):
	if debug:
	print "tail is now %d: segment just went invalid" % tail
	segments.append({
	'head': head,
	'tail': tail - 1,
	'len': head - tail - 1
	})
	if debug:
	print "segment marked: %s" % segments[-1]
	break

	# now we rank the segments we found by their overall length
	ordered_segments = sorted(segments, key=lambda s: s['len'])
	winner = ordered_segments[0]
	return ' '.join(haystack_words[winner['tail']:winner['head']+1])


	if __name__ == "__main__":
	try:
	multiple = max(int(sys.argv[1]), 1)
	except:
	multiple = 1

	words = ["landmark", "city", "bridge"]
	haystack = """The George Washington Bridge in New York City is
	one of the oldest bridges ever constructed. It is now being
	remodeled because the bridge is a landmark. City officials say
	that the landmark bridge effort will create a lot of new jobs in
	the city.""" * multiple
	print shortest_segment(words, haystack, debug=False)