dense crossword puzzle search
#!/usr/bin/env python
# Python 3.5 or greater, but probably backportable with minimal effort.
"""Creates dense crossword puzzle fills.

This module provides code to fill dense crossword puzzles. It is a work
in progress.

We represent the words in a puzzle as a square matrix of bytes.

The constraints on successor hypotheses are represented as properties of
the 2n "ranks" (rows or columns) rather than as constraints on possible
characters in the n^2 cells of the puzzle. We also prefer word-level
constraints to character-level constraints on the grounds that the
lexicon is much sparser than "sigma-star" (all strings over the
alphabet).

A constraint on a rank is represented as a hash-backed set of all the
words (encoded as indices into a lexicon array) which could occur in
said rank. We cache these sets as they are usually small. Adding a
constraint to a hypothesis then simply involves a series of in-place
set intersection operations.

In contrast, the search is primitive. Two strategies are currently
supported. One chooses successors randomly. The other chooses successors
according to the heuristic that more tightly specified ranks should be
filled earlier, to prevent search errors. Both strategies also score
hypotheses by the size of the satisfier set of their most-specified
rank, favoring hypotheses whose tightest rank is less specified.
"""

__author__ = "Kyle Gorman"

# TODO(kbg): These are hard-coded for now; determine them at runtime.
N = 5
BEAM_WIDTH = 1024
CANDIDATE_SUCCESSORS = 16
# TODO(kbg): Use a better wordlist. SUBTLEX? CELEX? Other?
LEXICON = "/usr/share/dict/words"
# TODO(kbg): Make sure not to use a word twice? Otherwise there's a cheap
# strategy of using the word once horizontally and once vertically. (But
# maybe this would be interesting for clueing!)

import copy
import heapq
import logging
import random

import numpy

logging.basicConfig(level="INFO")


# NB: Both Constraint and ConstrainedLexicon are agnostic as to
# orientation (i.e., row vs. column). Only Hypothesis instances need to
# care about orientation.
class Constraint(object):
    """A horizontal or vertical constraint on successors."""

    def __init__(self, char, posish):
        self._char = char
        self._posish = posish

    # Equality and hashing are by value, so that equivalent constraints
    # hit the same entry in ConstrainedLexicon's cache.

    def __eq__(self, other):
        return self._char == other._char and self._posish == other._posish

    def __hash__(self):
        return hash((self._char, self._posish))

    def satisfy(self, word):
        """Checks whether a word satisfies the constraint."""
        return word[self._posish] == self._char
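
# Example (illustrative): a constraint requiring first byte "a". Lexicon
# entries are bytestrings, so indexing one yields an integer byte value:
#
#     c = Constraint(ord("a"), 0)
#     c.satisfy(b"apple")  # True
#     c.satisfy(b"grape")  # False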

class ConstrainedLexicon(object):
    """The lexicon, and slices of it with constraints applied."""

    def __init__(self, lexicon):
        self._lexicon = lexicon
        # Maps a Constraint (key) to the set of indices into the lexicon
        # (value) that satisfy it.
        self._constraint_to_satisfiers = {}

    # A constraint key can be used to retrieve the subset of lexical
    # entries (in the form of a set of indices) that satisfy the
    # constraint. These subsets are then intersected with those of other
    # constraints. The subsets are lazily computed.
    # TODO(kbg): Possibly deallocate subsets after they're not needed
    # anymore.

    def __getitem__(self, constraint):
        # NB: Returns the cached set itself; callers must copy it before
        # mutating it.
        satisfiers = self._constraint_to_satisfiers.get(constraint)
        if satisfiers is None:
            satisfiers = {index for (index, successor) in
                          enumerate(self._lexicon) if
                          constraint.satisfy(successor)}
            self._constraint_to_satisfiers[constraint] = satisfiers
        return satisfiers

    def get_entry(self, index):
        """Given an index into the lexicon, returns the bytestring."""
        return self._lexicon[index]
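
# Example (illustrative): with the three-word lexicon below, the same
# Constraint value hits the cache on the second lookup:
#
#     cl = ConstrainedLexicon([b"tad", b"tip", b"den"])
#     cl[Constraint(ord("t"), 0)]  # {0, 1}, computed and cached.
#     cl[Constraint(ord("t"), 0)]  # {0, 1}, a cache hit.
#
# A rank subject to two constraints holds the intersection of their
# satisfier sets, e.g. {0, 1} & {1, 2} == {1}.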

class Hypothesis(object):
    """Data structure representing a (partial) hypothesis grid."""

    def __init__(self, n):
        self._n = n
        # This part is purely for visualization. Perhaps it should only
        # be computed at the end?
        self._grid = numpy.full((n, n), " ", dtype="S1")
        # Is a row or column empty?
        self._h_empty = {*range(n)}
        self._v_empty = {*range(n)}
        # Are there *any* constraints on the H and V dimensions?
        # TODO(kbg): Perhaps refactor to use the following facts instead:
        # self._h_constrained = len(self._v_empty) < self._n
        # self._v_constrained = len(self._h_empty) < self._n
        self._h_constrained = False
        self._v_constrained = False
        # Where we put the constraint information. Starts empty.
        self._h_satisfiers = [None for _ in range(n)]
        self._v_satisfiers = [None for _ in range(n)]

    # Stringifies the puzzle. Probably could be more elegant.

    def __str__(self):
        result = b"=" * self._n
        for row in self._grid:
            result += b"\n" + b"".join(tuple(row))
        result += b"\n"
        result += b"=" * self._n
        return result.decode("UTF-8")
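
    # Illustrative output for a hypothetical completed 3x3 fill (rows
    # "tad", "ire", "pen"; columns "tip", "are", "den"):
    #
    #     ===
    #     tad
    #     ire
    #     pen
    #     ===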

    # Generator for satisfiers.

    def _generate_satisfiers(self, word, index, constrained_lexicon):
        for (i, char) in enumerate(word):
            yield (i, constrained_lexicon[Constraint(char, index)])

    def add_word(self, word, index, is_row, constrained_lexicon):
        """Adds a new lexical item to the grid."""
        # NB: numpy.fromstring is deprecated; frombuffer is its
        # equivalent for bytes input.
        word_array = numpy.frombuffer(word, dtype="S1")
        if is_row:
            self._grid[index, :] = word_array
            self._h_empty.remove(index)
            # Creates constraints on the perpendicular dimension.
            if self._v_constrained:
                for (i, satisfiers) in self._generate_satisfiers(
                        word, index, constrained_lexicon):
                    self._v_satisfiers[i] &= satisfiers
            else:
                # Copies, so that the in-place intersections above never
                # mutate the sets cached in the ConstrainedLexicon.
                for (i, satisfiers) in self._generate_satisfiers(
                        word, index, constrained_lexicon):
                    self._v_satisfiers[i] = set(satisfiers)
                self._v_constrained = True
        else:
            self._grid[:, index] = word_array
            self._v_empty.remove(index)
            # Creates constraints on the perpendicular dimension.
            if self._h_constrained:
                for (i, satisfiers) in self._generate_satisfiers(
                        word, index, constrained_lexicon):
                    self._h_satisfiers[i] &= satisfiers
            else:
                # Copies, as above.
                for (i, satisfiers) in self._generate_satisfiers(
                        word, index, constrained_lexicon):
                    self._h_satisfiers[i] = set(satisfiers)
                self._h_constrained = True
    @property
    def final(self):
        """Returns whether the hypothesis is complete."""
        return not (self._h_empty or self._v_empty)

    def _h_viable(self):
        if self._h_constrained:
            return all(self._h_satisfiers[i] for i in self._h_empty)
        return True

    def _v_viable(self):
        if self._v_constrained:
            return all(self._v_satisfiers[i] for i in self._v_empty)
        return True

    @property
    def viable(self):
        """Returns whether the hypothesis is viable.

        A hypothesis is "viable" if every constrained dimension has at
        least one satisfier for each empty row/column in that dimension.

        This method is NOT valid before the first entry is inserted, nor
        is it valid when the hypothesis is final; DO NOT call it in
        either case.
        """
        return self._h_viable() and self._v_viable()

    # The score is simply the number of satisfiers for the most
    # constrained unfilled rank in the puzzle. It is only really
    # comparable across partial hypotheses that have the same number of
    # empty ranks.
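    # For example, if two empty rows have 3 and 7 satisfiers and an
    # empty column has 5, the score is 3; the beam keeps the
    # highest-scoring (i.e., least pinned-down) hypotheses.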
    @property
    def score(self):
        best = float("inf")
        if self._h_constrained and self._h_empty:
            best = min(len(self._h_satisfiers[i]) for i in self._h_empty)
        if self._v_constrained and self._v_empty:
            best = min(best,
                       min(len(self._v_satisfiers[i]) for
                           i in self._v_empty))
        return best
    def get_random_successor(self, constrained_lexicon):
        """Randomly generates a successor."""
        assert self._h_empty or self._v_empty
        result = copy.deepcopy(self)
        candidates = []
        if self._h_constrained:
            candidates.extend((index, satisfiers, True) for
                              (index, satisfiers) in
                              enumerate(self._h_satisfiers) if
                              index in self._h_empty)
        if self._v_constrained:
            candidates.extend((index, satisfiers, False) for
                              (index, satisfiers) in
                              enumerate(self._v_satisfiers) if
                              index in self._v_empty)
        # Chooses a random candidate rank and a random word for it.
        (index, eset, orientation) = random.choice(candidates)
        word = constrained_lexicon.get_entry(random.choice(tuple(eset)))
        result.add_word(word, index, orientation, constrained_lexicon)
        return result

    # See module-level documentation for a description of this heuristic.

    def get_heuristic_successor(self, constrained_lexicon):
        """Generates a successor according to the heuristic."""
        assert self._h_empty or self._v_empty
        result = copy.deepcopy(self)
        candidates = []
        if self._h_constrained:
            candidates.extend((index, satisfiers, True) for
                              (index, satisfiers) in
                              enumerate(self._h_satisfiers) if
                              index in self._h_empty)
        if self._v_constrained:
            candidates.extend((index, satisfiers, False) for
                              (index, satisfiers) in
                              enumerate(self._v_satisfiers) if
                              index in self._v_empty)
        # Chooses the most tightly constrained row/column to fill next.
        (argmin, minset, orientation) = min(candidates,
                                            key=lambda triple:
                                            len(triple[1]))
        word = constrained_lexicon.get_entry(random.choice(tuple(minset)))
        result.add_word(word, argmin, orientation, constrained_lexicon)
        return result

# TODO(kbg): Terrible name; change it.
class Search(object):
    """Data structure for greedy search for a dense crossword puzzle."""

    # TODO(kbg): Way too much work in the constructor.

    def __init__(self, n, lexicon, beam_width):
        # Initializations.
        logging.info("N: %d", n)
        self._i = 0  # Number of completed search iterations.
        self._n = n
        logging.info("Beam width: %d", beam_width)
        self._beam = []
        self._beam_width = beam_width
        self._constrained_lexicon = ConstrainedLexicon(lexicon)
        # Adds initial hypotheses.
        logging.info("Creating initial hypotheses and constrained lexica")
        random_words = random.sample(lexicon, self._beam_width)
        random_indices = numpy.random.randint(0, self._n,
                                              self._beam_width)
        # True: row, False: column.
        random_orientations = numpy.random.rand(self._beam_width) > .5
        nonviable = 0
        for (word, index, orientation) in zip(random_words,
                                              random_indices,
                                              random_orientations):
            hyp = Hypothesis(self._n)
            # Adds constraints on the other orientation to the
            # hypothesis.
            hyp.add_word(word, index, orientation,
                         self._constrained_lexicon)
            if not hyp.viable:
                nonviable += 1
                continue
            self._beam.append(hyp)
        logging.info("Initialization complete")
        logging.info("Nonviable initial hypotheses: %d", nonviable)
        # Now, the normal case: each surviving hypothesis already has one
        # filled rank, so 2n - 1 further iterations fill the remaining
        # ranks.
        while self._i < self._n + self._n - 1:
            successors = set()
            for hyp in self._beam:
                for _ in range(CANDIDATE_SUCCESSORS):
                    successor = hyp.get_heuristic_successor(
                        self._constrained_lexicon)
                    if not successor.viable:
                        continue
                    successors.add(successor)
            self._beam = heapq.nlargest(self._beam_width, successors,
                                        key=lambda x: x.score)
            del successors  # Hope this forces a nice garbage collection.
            logging.info("Iteration %d complete", self._i)
            self._i += 1
        # Prints out the result.
        logging.info("Final hypotheses: %d", len(self._beam))
        print()
        for hypothesis in self._beam:
            print(hypothesis)
            print()

def get_lexicon(filename, n):
    """Generator producing a lexicon of case-folded words of length n."""
    logging.debug("Reading lexicon from file %s", filename)
    with open(filename, "r") as source:
        for line in source:
            if not line.islower():
                continue
            line = line.rstrip()
            if len(line) == n:
                yield bytes(line.lower(), "UTF-8")

if __name__ == "__main__":
    # Reads in the lexicon.
    lexicon = list(get_lexicon(LEXICON, N))
    # Constructs and invokes the search singleton, which prints the
    # results as a side effect.
    # TODO(kbg): Weird API (shouldn't it return them? why is so much
    # work being done in the constructor?), but whatever.
    Search(N, lexicon, BEAM_WIDTH)