Skip to content

Instantly share code, notes, and snippets.

@strongriley
Created March 10, 2013 22:01
Show Gist options
  • Save strongriley/5130688 to your computer and use it in GitHub Desktop.
Save strongriley/5130688 to your computer and use it in GitHub Desktop.
EAT Club Engineering Test
"""
EAT Club Highlight Programming Challenge
Author: Riley Strong <eponymous@rileystrong.com>
Last Updated: 2013-03-09
"""
import re
import sys
import unittest
START_HIGHLIGHT = "[HIGHLIGHT]"
END_HIGHLIGHT = "[ENDHIGHLIGHT]"
WORD_REGEX = r"[\w\d']+"
SENTENCE_ENDERS = ".?!"
# Adjust this to loosen or tighten the match clusters
MAX_ALLOWED_MATCH_DISTANCE = 5
def ends_sentence(word):
return any(c in SENTENCE_ENDERS for c in word)
def highlight(doc, query):
"""
Highlights the most relevant part of the document based on the query.
Works by identifying the largest cluster of matched words.
"""
# Extract words.
clean_words = [w.lower() for w in re.findall(WORD_REGEX, doc)]
query_words = [w.lower() for w in re.findall(WORD_REGEX, query)]
# Find positions of matches.
matches = []
i = 0
for word in clean_words:
for query_word in query_words:
if word == query_word:
matches.append(i)
i = i + 1
if len(matches) == 0:
return None
# Calculate distances between matches.
distances = []
for i in range(1, len(matches)):
distances.append(matches[i] - matches[i - 1])
# Create clusters of matches based on distances.
match_i = 0
cluster_i = 0
clusters = []
for match in matches:
# build new cluster.
if cluster_i + 1 > len(clusters):
clusters.append([matches[match_i]])
else:
clusters[cluster_i].append(matches[match_i])
# Determine if this is the last element in the cluster.
if match_i + 1 < len(matches):
distance = distances[match_i]
if distance > MAX_ALLOWED_MATCH_DISTANCE:
cluster_i += 1
match_i += 1
# Choose largest cluster.
max_cluster_size = 0
largest_cluster = None
for cluster in clusters:
length = len(cluster)
if length > max_cluster_size:
max_cluster_size = length
largest_cluster = cluster
# Build text around cluster.
# Work from last sentence through end of last match's sentence.
words = re.split(r' | ', doc)
wordcount = len(words)
snippet_begins_at = None
if largest_cluster[0] == 0:
snippet_begins_at = 0
else:
# Loop backwards to find closest end of sentence.
for i in range(largest_cluster[0] - 1, -1, -1):
if ends_sentence(words[i]):
snippet_begins_at = i + 1 # The next word.
break
if not snippet_begins_at:
snippet_begins_at = 0
snippet_ends_at = None
if largest_cluster[-1] == wordcount - 1:
snippet_ends_at = wordcount
else:
# Loop forward from last match in cluster.
for i in range(largest_cluster[-1], wordcount):
if ends_sentence(words[i]):
snippet_ends_at = i + 1
break
if not snippet_ends_at:
snippet_ends_at = wordcount
# Apply highlighting
highlighting = False
formatted_text = ''
for i in range(snippet_begins_at, snippet_ends_at):
if highlighting:
if i not in largest_cluster:
formatted_text += END_HIGHLIGHT
highlighting = False
formatted_text = ' '.join((formatted_text, words[i]))
else:
formatted_text += " "
if i in largest_cluster:
formatted_text += START_HIGHLIGHT
highlighting = True
formatted_text += words[i]
if highlighting and i + 1 == snippet_ends_at:
formatted_text += END_HIGHLIGHT
highlighting = False
return formatted_text.lstrip()
class HighlightTests(unittest.TestCase):
"""
Tests the snippet and highlighting abilities.
TODO more tests:
* Different punctuation
* Non-alphanumeric words
* match is 1st word.
* match is last word.
"""
def test_full_match(self):
"""
Tests the entire query phrase is matched.
"""
doc = ' '.join(("Sweet Basil has great chicken pad thai.",
"The duck curry is also delicious. The Thai iced",
"tea was a bit too sweet though."))
query = "chicken pad thai"
expected = ''.join(("Sweet Basil has great ", START_HIGHLIGHT,
"chicken pad thai.", END_HIGHLIGHT))
self.assertEqual(highlight(doc, query), expected)
def test_partial_match(self):
"""
Tests a partial match of the query still works.
"""
doc = ' '.join(("I ordered the pad thai with chicken, which was",
"awesome! Now I order it every day."))
query = "chicken pad thai"
expected = ''.join(("I ordered the ", START_HIGHLIGHT, "pad thai",
END_HIGHLIGHT, " with ", START_HIGHLIGHT,
"chicken,", END_HIGHLIGHT, " which was awesome!"))
self.assertEqual(highlight(doc, query), expected)
def test_no_match(self):
"""
Tests when nothing from the query matches.
"""
doc = ' '.join(("It is a period of civil war. Rebel spaceships,",
"striking from a hidden base, have won their first victory",
"against the evil Galactic Empire. During the battle, Rebel",
"spies managed to steal secret plans to the Empire's ultimate",
"weapon, the DEATH STAR, an armored space station with enough",
"power to destroy an entire planet. Pursued by the Empire's",
"sinister agents, Princess Leia races home aboard her starship,",
"custodian of the stolen plans that can save her people and",
"restore freedom to the galaxy..."))
query = "chicken pad thai"
self.assertEqual(highlight(doc, query), None)
def test_multiple_clusters(self):
"""
Tests if multiple clusters are detected.
TODO algorithm could be improved, in event of a tie, select cluster
with shortest distances.
"""
doc = ' '.join(("I've got a bike. You can ride it if you'd like.",
"It's got a basket, a bell that rings, and things to make it",))
#"look good... I know a mouse and he hasn't got a house.",
#"I don't know why I call him Gerald..."))
query = "I've bike ride like mouse house"
expected = ''.join((START_HIGHLIGHT, "I've", END_HIGHLIGHT, " got a ", START_HIGHLIGHT, "bike.",
END_HIGHLIGHT, " You can ", START_HIGHLIGHT, "ride", END_HIGHLIGHT,
" it if you'd ", START_HIGHLIGHT, "like.", END_HIGHLIGHT))
self.assertEqual(highlight(doc, query), expected)
if __name__ == "__main__":
try:
doc = sys.argv[1]
query = sys.argv[2]
except IndexError:
unittest.main()
sys.exit(0)
result = highlight(doc, query)
if not result:
print "No matches found."
else:
print result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment