strongriley/highlight.py

## highlight.py
"""
EAT Club Highlight Programming Challenge
Author: Riley Strong <eponymous@rileystrong.com>
Last Updated: 2013-03-09
"""
import re
import sys
import unittest

START_HIGHLIGHT = "[HIGHLIGHT]"
END_HIGHLIGHT = "[ENDHIGHLIGHT]"
WORD_REGEX = r"[\w\d']+"
SENTENCE_ENDERS = ".?!"

# Adjust this to loosen or tighten the match clusters
MAX_ALLOWED_MATCH_DISTANCE = 5


def ends_sentence(word):
    return any(c in SENTENCE_ENDERS for c in word)


def highlight(doc, query):
    """
    Highlights the most relevant part of the document based on the query.
    Works by identifying the largest cluster of matched words.
    """
    # Extract words.
    clean_words = [w.lower() for w in re.findall(WORD_REGEX, doc)]
    query_words = [w.lower() for w in re.findall(WORD_REGEX, query)]

    # Find positions of matches.
    matches = []
    i = 0
    for word in clean_words:
        for query_word in query_words:
            if word == query_word:
                matches.append(i)
        i = i + 1
    if len(matches) == 0:
        return None

    # Calculate distances between matches.
    distances = []
    for i in range(1, len(matches)):
        distances.append(matches[i] - matches[i - 1])

    # Create clusters of matches based on distances.
    match_i = 0
    cluster_i = 0
    clusters = []
    for match in matches:
        # build new cluster.
        if cluster_i + 1 > len(clusters):
            clusters.append([matches[match_i]])
        else:
            clusters[cluster_i].append(matches[match_i])
        # Determine if this is the last element in the cluster.
        if match_i + 1 < len(matches):
            distance = distances[match_i]
            if distance > MAX_ALLOWED_MATCH_DISTANCE:
                cluster_i += 1
        match_i += 1

    # Choose largest cluster.
    max_cluster_size = 0
    largest_cluster = None
    for cluster in clusters:
        length = len(cluster)
        if length > max_cluster_size:
            max_cluster_size = length
            largest_cluster = cluster

    # Build text around cluster.
    # Work from last sentence through end of last match's sentence.
    words = re.split(r' |  ', doc)
    wordcount = len(words)
    snippet_begins_at = None
    if largest_cluster[0] == 0:
        snippet_begins_at = 0
    else:
        # Loop backwards to find closest end of sentence.
        for i in range(largest_cluster[0] - 1, -1, -1):
            if ends_sentence(words[i]):
                snippet_begins_at = i + 1  # The next word.
                break
        if not snippet_begins_at:
            snippet_begins_at = 0

    snippet_ends_at = None
    if largest_cluster[-1] == wordcount - 1:
        snippet_ends_at = wordcount
    else:
        # Loop forward from last match in cluster.
        for i in range(largest_cluster[-1], wordcount):
            if ends_sentence(words[i]):
                snippet_ends_at = i + 1
                break
        if not snippet_ends_at:
            snippet_ends_at = wordcount

    # Apply highlighting
    highlighting = False
    formatted_text = ''
    for i in range(snippet_begins_at, snippet_ends_at):
        if highlighting:
            if i not in largest_cluster:
                formatted_text += END_HIGHLIGHT
                highlighting = False
            formatted_text = ' '.join((formatted_text, words[i]))
        else:
            formatted_text += " "
            if i in largest_cluster:
                formatted_text += START_HIGHLIGHT
                highlighting = True
            formatted_text += words[i]
        if highlighting and i + 1 == snippet_ends_at:
            formatted_text += END_HIGHLIGHT
            highlighting = False
    return formatted_text.lstrip()


class HighlightTests(unittest.TestCase):
    """
    Tests the snippet and highlighting abilities.
    TODO more tests:
    * Different punctuation
    * Non-alphanumeric words
    * match is 1st word.
    * match is last word.
    """
    def test_full_match(self):
        """
        Tests the entire query phrase is matched.
        """
        doc = ' '.join(("Sweet Basil has great chicken pad thai.",
                        "The duck curry is also delicious. The Thai iced",
                        "tea was a bit too sweet though."))
        query = "chicken pad thai"
        expected = ''.join(("Sweet Basil has great ", START_HIGHLIGHT,
                           "chicken pad thai.", END_HIGHLIGHT))
        self.assertEqual(highlight(doc, query), expected)

    def test_partial_match(self):
        """
        Tests a partial match of the query still works.
        """
        doc = ' '.join(("I ordered the pad thai with chicken, which was",
                        "awesome! Now I order it every day."))
        query = "chicken pad thai"
        expected = ''.join(("I ordered the ", START_HIGHLIGHT, "pad thai",
                            END_HIGHLIGHT, " with ", START_HIGHLIGHT,
                            "chicken,", END_HIGHLIGHT, " which was awesome!"))
        self.assertEqual(highlight(doc, query), expected)

    def test_no_match(self):
        """
        Tests when nothing from the query matches.
        """
        doc = ' '.join(("It is a period of civil war. Rebel spaceships,",
            "striking from a hidden base, have won their first victory",
            "against the evil Galactic Empire. During the battle, Rebel",
            "spies managed to steal secret plans to the Empire's ultimate",
            "weapon, the DEATH STAR, an armored space station with enough",
            "power to destroy an entire planet. Pursued by the Empire's",
            "sinister agents, Princess Leia races home aboard her starship,",
            "custodian of the stolen plans that can save her people and",
            "restore freedom to the galaxy..."))
        query = "chicken pad thai"
        self.assertEqual(highlight(doc, query), None)

    def test_multiple_clusters(self):
        """
        Tests if multiple clusters are detected.
        TODO algorithm could be improved, in event of a tie, select cluster
        with shortest distances.
        """
        doc = ' '.join(("I've got a bike. You can ride it if you'd like.",
            "It's got a basket, a bell that rings, and things to make it",))
            #"look good... I know a mouse and he hasn't got a house.",
            #"I don't know why I call him Gerald..."))
        query = "I've bike ride like mouse house"
        expected = ''.join((START_HIGHLIGHT, "I've", END_HIGHLIGHT, " got a ", START_HIGHLIGHT, "bike.",
            END_HIGHLIGHT, " You can ", START_HIGHLIGHT, "ride", END_HIGHLIGHT,
            " it if you'd ", START_HIGHLIGHT, "like.", END_HIGHLIGHT))
        self.assertEqual(highlight(doc, query), expected)


if __name__ == "__main__":
    try:
        doc = sys.argv[1]
        query = sys.argv[2]
    except IndexError:
        unittest.main()
        sys.exit(0)
    result = highlight(doc, query)
    if not result:
        print "No matches found."
    else:
        print result
	"""
	EAT Club Highlight Programming Challenge
	Author: Riley Strong <eponymous@rileystrong.com>
	Last Updated: 2013-03-09
	"""
	import re
	import sys
	import unittest

	START_HIGHLIGHT = "[HIGHLIGHT]"
	END_HIGHLIGHT = "[ENDHIGHLIGHT]"
	WORD_REGEX = r"[\w\d']+"
	SENTENCE_ENDERS = ".?!"

	# Adjust this to loosen or tighten the match clusters
	MAX_ALLOWED_MATCH_DISTANCE = 5


	def ends_sentence(word):
	return any(c in SENTENCE_ENDERS for c in word)


	def highlight(doc, query):
	"""
	Highlights the most relevant part of the document based on the query.
	Works by identifying the largest cluster of matched words.
	"""
	# Extract words.
	clean_words = [w.lower() for w in re.findall(WORD_REGEX, doc)]
	query_words = [w.lower() for w in re.findall(WORD_REGEX, query)]

	# Find positions of matches.
	matches = []
	i = 0
	for word in clean_words:
	for query_word in query_words:
	if word == query_word:
	matches.append(i)
	i = i + 1
	if len(matches) == 0:
	return None

	# Calculate distances between matches.
	distances = []
	for i in range(1, len(matches)):
	distances.append(matches[i] - matches[i - 1])

	# Create clusters of matches based on distances.
	match_i = 0
	cluster_i = 0
	clusters = []
	for match in matches:
	# build new cluster.
	if cluster_i + 1 > len(clusters):
	clusters.append([matches[match_i]])
	else:
	clusters[cluster_i].append(matches[match_i])
	# Determine if this is the last element in the cluster.
	if match_i + 1 < len(matches):
	distance = distances[match_i]
	if distance > MAX_ALLOWED_MATCH_DISTANCE:
	cluster_i += 1
	match_i += 1

	# Choose largest cluster.
	max_cluster_size = 0
	largest_cluster = None
	for cluster in clusters:
	length = len(cluster)
	if length > max_cluster_size:
	max_cluster_size = length
	largest_cluster = cluster

	# Build text around cluster.
	# Work from last sentence through end of last match's sentence.
	words = re.split(r' \| ', doc)
	wordcount = len(words)
	snippet_begins_at = None
	if largest_cluster[0] == 0:
	snippet_begins_at = 0
	else:
	# Loop backwards to find closest end of sentence.
	for i in range(largest_cluster[0] - 1, -1, -1):
	if ends_sentence(words[i]):
	snippet_begins_at = i + 1 # The next word.
	break
	if not snippet_begins_at:
	snippet_begins_at = 0

	snippet_ends_at = None
	if largest_cluster[-1] == wordcount - 1:
	snippet_ends_at = wordcount
	else:
	# Loop forward from last match in cluster.
	for i in range(largest_cluster[-1], wordcount):
	if ends_sentence(words[i]):
	snippet_ends_at = i + 1
	break
	if not snippet_ends_at:
	snippet_ends_at = wordcount

	# Apply highlighting
	highlighting = False
	formatted_text = ''
	for i in range(snippet_begins_at, snippet_ends_at):
	if highlighting:
	if i not in largest_cluster:
	formatted_text += END_HIGHLIGHT
	highlighting = False
	formatted_text = ' '.join((formatted_text, words[i]))
	else:
	formatted_text += " "
	if i in largest_cluster:
	formatted_text += START_HIGHLIGHT
	highlighting = True
	formatted_text += words[i]
	if highlighting and i + 1 == snippet_ends_at:
	formatted_text += END_HIGHLIGHT
	highlighting = False
	return formatted_text.lstrip()


	class HighlightTests(unittest.TestCase):
	"""
	Tests the snippet and highlighting abilities.
	TODO more tests:
	* Different punctuation
	* Non-alphanumeric words
	* match is 1st word.
	* match is last word.
	"""
	def test_full_match(self):
	"""
	Tests the entire query phrase is matched.
	"""
	doc = ' '.join(("Sweet Basil has great chicken pad thai.",
	"The duck curry is also delicious. The Thai iced",
	"tea was a bit too sweet though."))
	query = "chicken pad thai"
	expected = ''.join(("Sweet Basil has great ", START_HIGHLIGHT,
	"chicken pad thai.", END_HIGHLIGHT))
	self.assertEqual(highlight(doc, query), expected)

	def test_partial_match(self):
	"""
	Tests a partial match of the query still works.
	"""
	doc = ' '.join(("I ordered the pad thai with chicken, which was",
	"awesome! Now I order it every day."))
	query = "chicken pad thai"
	expected = ''.join(("I ordered the ", START_HIGHLIGHT, "pad thai",
	END_HIGHLIGHT, " with ", START_HIGHLIGHT,
	"chicken,", END_HIGHLIGHT, " which was awesome!"))
	self.assertEqual(highlight(doc, query), expected)

	def test_no_match(self):
	"""
	Tests when nothing from the query matches.
	"""
	doc = ' '.join(("It is a period of civil war. Rebel spaceships,",
	"striking from a hidden base, have won their first victory",
	"against the evil Galactic Empire. During the battle, Rebel",
	"spies managed to steal secret plans to the Empire's ultimate",
	"weapon, the DEATH STAR, an armored space station with enough",
	"power to destroy an entire planet. Pursued by the Empire's",
	"sinister agents, Princess Leia races home aboard her starship,",
	"custodian of the stolen plans that can save her people and",
	"restore freedom to the galaxy..."))
	query = "chicken pad thai"
	self.assertEqual(highlight(doc, query), None)

	def test_multiple_clusters(self):
	"""
	Tests if multiple clusters are detected.
	TODO algorithm could be improved, in event of a tie, select cluster
	with shortest distances.
	"""
	doc = ' '.join(("I've got a bike. You can ride it if you'd like.",
	"It's got a basket, a bell that rings, and things to make it",))
	#"look good... I know a mouse and he hasn't got a house.",
	#"I don't know why I call him Gerald..."))
	query = "I've bike ride like mouse house"
	expected = ''.join((START_HIGHLIGHT, "I've", END_HIGHLIGHT, " got a ", START_HIGHLIGHT, "bike.",
	END_HIGHLIGHT, " You can ", START_HIGHLIGHT, "ride", END_HIGHLIGHT,
	" it if you'd ", START_HIGHLIGHT, "like.", END_HIGHLIGHT))
	self.assertEqual(highlight(doc, query), expected)


	if __name__ == "__main__":
	try:
	doc = sys.argv[1]
	query = sys.argv[2]
	except IndexError:
	unittest.main()
	sys.exit(0)
	result = highlight(doc, query)
	if not result:
	print "No matches found."
	else:
	print result