Wordle Disjoint Set
#!/usr/bin/env python3
# The point of this exercise is to find a list of 5 words with 25 unique letters across them.
# As suggested by https://www.youtube.com/watch?v=_-AfhLQfb6w
#
# This problem is equivalent to searching for a disjoint set in the graph induced on shared letters between words.
# That problem is equivalent to searching for a clique in the inverse graph.
# The approach used here builds the graph as edges first between letters, then between words.
# Each step increases the cliques it found by 1 word.
#
# For the pre-NYT wordle list, the filtered list comes to 5144 words with unique letters, which induce 4651264 edges in the graph, and there are 10 solutions.
# Disjoint set counts by size: 2=2325632, 3=57262297, 4=12058120, 5=10
# On my 10-year-old laptop, it takes about an hour to discover all results
# Loaded list of 5144 words
# Built word edge list with 4651264 edges
# took 4.800 seconds
# First step produced 2325632 candidates
# took 4.129 seconds
# Second step produced 57262297 candidates
# took 275.470 seconds
# Third step produced 12058120 candidates
# took 2420.292 seconds
# Fourth step produced 10 solutions
# took 557.635 seconds
# {('clipt', 'jumby', 'kreng', 'vozhd', 'waqfs'), ('brung', 'kempt', 'vozhd', 'waqfs', 'xylic'), ('bling', 'jumpy', 'treck', 'vozhd', 'waqfs'), ('glent', 'jumby', 'prick', 'vozhd', 'waqfs'), ('jumby', 'pling', 'treck', 'vozhd', 'waqfs'), ('chunk', 'fjord', 'gymps', 'vibex', 'waltz'), ('bemix', 'clunk', 'grypt', 'vozhd', 'waqfs'), ('brick', 'glent', 'jumpy', 'vozhd', 'waqfs'), ('fjord', 'gucks', 'nymph', 'vibex', 'waltz'), ('blunk', 'cimex', 'grypt', 'vozhd', 'waqfs')}
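
# A quick illustration of the edge rule described above: two words are connected exactly
# when their letter sets are disjoint, so a solution is a 5-clique in that graph.
# For example, set("fjord") & set("waltz") == set(), and in the listed solution
# ('chunk', 'fjord', 'gymps', 'vibex', 'waltz') every pair of words passes the same test,
# covering 25 of the 26 letters.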
from collections.abc import Iterable
import contextlib
import time
import functools


@contextlib.contextmanager
def timed(*, start=None, title=None):
    if title is None:
        title = ""
    else:
        title += " "
    if start is None:
        start = time.time()
    yield
    ended = time.time()
    print(f"{title}took {ended - start:.3f} seconds")


def load_word_list(path) -> set[str]:
    with open(path) as f:
        words = f.read().split("\n")
    words = [word.lower() for word in words]  # lowercase all the words
    words = [word for word in words if word.isalpha() and word.isascii()]  # discard any words with non-ascii letters
    words = [word for word in words if len(word) == 5]  # discard words that aren't 5 letters long
    words = [word for word in words if len(set(word)) == len(word)]  # discard words with repeated letters (e.g. abase)
    words = list({tuple(sorted(word)): word for word in words}.values())  # discard anagrams
    return set(words)
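
# Note on the anagram line above: keying the dict by the sorted letter tuple keeps a single
# representative per letter set (e.g. "least" and "steal" would collapse into one entry,
# assuming both appear in the input list), which is enough here because the search only
# cares about which letters a word uses, not its spelling.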


def words_letters(words: Iterable[str]) -> set[str]:
    return set(letter for word in words for letter in word)


def words_for_word(word: str, all_words: set[str], words_by_letter: dict[str, set[str]]) -> set[str]:
    letters = set(word)
    words_with_shared = set()
    for letter in letters:
        words_with_shared |= words_by_letter[letter]
    return all_words - words_with_shared


def build_words_by_word(all_words: set[str]) -> dict[str, set[str]]:
    all_letters = set(letter for word in all_words for letter in word)
    words_by_letter = {letter: set(word for word in all_words if letter in word) for letter in all_letters}
    words_by_word = {word: words_for_word(word, all_words, words_by_letter=words_by_letter) for word in all_words}
    return words_by_word
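
# Sketch of the structure this builds (entries shown are illustrative): words_by_word maps
# each word to every other word it shares no letters with, e.g. words_by_word["fjord"]
# would contain "waltz" but not "chord" (shared o/r/d). Since each set lists both endpoints
# of an edge, the edge count printed in main() (4651264) is exactly twice the number of
# pairs found by the first search step (2325632).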


def search_step(candidates: Iterable[tuple[str, ...]], words_by_word: dict[str, set[str]]) -> set[tuple[str, ...]]:
    results = set()
    for candidate in candidates:
        next_steps = [words_by_word[word] for word in candidate]
        next_words = functools.reduce(lambda a, b: a & b, next_steps)
        for next_word in next_words:
            results.add(tuple(sorted(list(candidate) + [next_word])))
    return results
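
# How a candidate grows by one word per step (words taken from the solutions above):
# starting from ('vozhd',), the intersection of the words_by_word sets of its members
# includes 'waqfs' (no shared letters), so ('vozhd', 'waqfs') survives into the next step;
# sorting the tuple keeps the same set of words from appearing as several distinct candidates.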


def main():
    import argparse

    parser = argparse.ArgumentParser(
        description="Find sets of 5 words with 25 unique letters. AKA the wordle set challenge"
    )
    parser.add_argument(
        "path",
        metavar="WORDS",
        nargs="?",
        default="/usr/share/dict/words",
        help="The words file to use. Defaults to /usr/share/dict/words",
    )
    args = parser.parse_args()

    all_words = load_word_list(args.path)
    print(f"Loaded list of {len(all_words)} words")

    with timed():
        words_by_word = build_words_by_word(all_words)
        print(f"Built word edge list with {sum(len(ws) for ws in words_by_word.values())} edges")

    with timed():
        step1 = search_step([(word,) for word in all_words], words_by_word=words_by_word)
        print(f"First step produced {len(step1)} candidates")

    with timed():
        step2 = search_step(step1, words_by_word=words_by_word)
        print(f"Second step produced {len(step2)} candidates")

    with timed():
        step3 = search_step(step2, words_by_word=words_by_word)
        print(f"Third step produced {len(step3)} candidates")

    with timed():
        step4 = search_step(step3, words_by_word=words_by_word)
        print(f"Fourth step produced {len(step4)} solutions")

    print(step4)


if __name__ == "__main__":
    main()
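
# Example invocation (the script and word-list filenames here are only placeholders):
#   python3 wordle_sets.py wordle-allowed-guesses.txt
#   python3 wordle_sets.py    # falls back to /usr/share/dict/words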