Created
March 16, 2013 17:42
-
-
Save mstepniowski/45f6dc6cd8d9fc288eea to your computer and use it in GitHub Desktop.
PyCon 2013 Thumbtack Anagram Challenge
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import sys | |
from collections import defaultdict | |
from itertools import combinations, chain | |
def powerset(iterable): | |
"""powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)""" | |
s = list(iterable) | |
return chain.from_iterable(combinations(s, r) for r in range(len(s)+1)) | |
def words(pairs): | |
"""Given a list of word pairs, return the set of words in these pairs. | |
""" | |
result = set() | |
for pair in pairs: | |
result.add(pair[0]) | |
result.add(pair[1]) | |
return result | |
def find_anagrams(text): | |
"""Given text as string, yield all anagram groups in the text, as word pairs.""" | |
words = set(w.lower() for w in re.split(r'[^a-zA-Z0-9]+', text) if len(w) > 3) | |
# Build index | |
d = defaultdict(set) | |
for w1 in words: | |
for w2 in words: | |
if w1 < w2: | |
d[''.join(sorted(w1 + w2))].add((w1, w2)) | |
# Find anagrams | |
for pair_set in d.values(): | |
if len(pair_set) >= 2: | |
# Possible anagram pair, check if there is no word shared | |
pair_list = list(pair_set) | |
common_words = set(pair_list[0]) | |
for word_pair in pair_list[1:]: | |
common_words.intersection_update(set(word_pair)) | |
if len(common_words) == 0: | |
yield pair_list | |
def normalize_group(pairs): | |
"""Given a list of word pairs, return the longest list of word | |
pairs, such that no word in the list is repeated. | |
""" | |
max_word_pairs = [] | |
for word_pairs in powerset(pairs): | |
unique_words = words(word_pairs) | |
if len(unique_words) == 2 * len(word_pairs): | |
max_word_pairs = word_pairs | |
return max_word_pairs | |
if __name__ == '__main__': | |
anagram_groups = find_anagrams(sys.stdin.read()) | |
by_length = defaultdict(list) | |
for group in anagram_groups: | |
normalized_group = normalize_group(group) | |
by_length[len(normalized_group)].append(normalized_group) | |
max_length = max(by_length.keys()) | |
for group in by_length[max_length]: | |
print ', '.join('%s %s' % (w1, w2) for (w1, w2) in group) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment