Created
January 22, 2014 18:43
-
-
Save treystout/8564705 to your computer and use it in GitHub Desktop.
A shot at finding the smallest substring of a larger string that contains all of the words in the needle phrase
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""phrase_finder.py | |
Find the shortest segment of haystack text containing ALL of the needle words | |
SOME LICENSE BLAH BLAH... | |
Author: Trey Stout <treystout@gmail.com> | |
""" | |
import itertools | |
import re | |
import sys | |
def shortest_segment(words, haystack, debug=False):
    """Return the shortest substring of `haystack` containing every word
    in the iterable `words`.

    Matching is case-insensitive: the haystack is lowercased (and runs of
    whitespace collapsed to single spaces) and each needle word is
    lowercased before searching.  Each needle must appear as a whole word
    (bounded by ``\\b``).  The returned segment is drawn from the
    normalized (lowercased, whitespace-collapsed) haystack.

    Raises `RuntimeError` if any word in `words` is not found.
    """
    NEEDLE_TMPL = r'\b%s\b'  # regex template enforcing word boundaries
    if not words:
        return ""  # no needles -> the empty segment trivially covers them
    # sanitize our haystack a bit: lowercase, trim, collapse whitespace
    haystack = re.sub(r'\s+', ' ', haystack.lower().strip())
    # lowercase the needles (the haystack was lowercased) and escape them so
    # regex metacharacters in a word cannot corrupt the pattern
    lowered = [w.lower() for w in words]
    # exit early if one of our needle words is absent from the input text
    for word in lowered:
        if not re.search(NEEDLE_TMPL % re.escape(word), haystack):
            raise RuntimeError("'%s' was not found in the input text" % word)
    # one OR-of-all-needles pattern; index every match, grouped by word.
    # m.group() is the literal matched text, which equals the lowered word.
    needle = "|".join(NEEDLE_TMPL % re.escape(w) for w in lowered)
    all_matches = {w: [] for w in lowered}
    for m in re.finditer(needle, haystack):
        all_matches[m.group()].append(m)
    # try every combination that picks one occurrence of each word; the
    # tightest span over a combination is a candidate segment
    best = None
    for combo in itertools.product(*all_matches.values()):
        if debug:
            print("checking combination: %s"
                  % ["%d:%s" % (m.start(), m.group()) for m in combo])
        start = min(m.start() for m in combo)
        end = max(m.end() for m in combo)  # match.end() is already exclusive
        segment = haystack[start:end]
        if best is None or len(segment) < len(best):
            best = segment
    if debug:
        print("shortest segment found: %r" % best)
    return best
if __name__ == "__main__":
    # Demo: find the tightest span of the haystack covering all three
    # needle words.  (Python 3 print-call syntax; also valid in Python 2
    # with a single argument.)
    words = ["landmark", "city", "bridge"]
    haystack = """The George Washington Bridge in New York City is
    one of the oldest bridges ever constructed. It is now being
    remodeled because the bridge is a landmark. City officials say
    that the landmark bridge effort will create a lot of new jobs in
    the city."""
    print(shortest_segment(words, haystack, debug=False))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Some potential issues: a limit should probably be set on the number of words allowed in the needle list. Each needle word is checked individually against the haystack to ensure it exists, so very large needles, very large haystacks, or a large number of needle words could exhaust resources quickly.
It does handle dupes fairly well. It also keeps an index of all matches with their locations, so if you wanted to return the offset as well as the substring, that would be a minor change.
If the itertools.product() call is considered cheating, it wouldn't be hard to implement that.