#!/usr/bin/env python
# Python 3.5 or greater, but probably backportable with minimal effort.
"""Creates dense crossword puzzle fills.
This module provides code to fill dense cross-word puzzles. It is a work
in progress.
We represent words in a puzzle as a square matrix of bytes.
The constraints on successor hypotheses are represented as properties of
the 2n "ranks" (rows or columns) rather than as constraints on possible
characters in the n^2 cells in the puzzle. We also prefer word-level
constraints to character-level constraints on the grounds that the
lexicon is much sparser than "sigma-star" (the alphabet).
A constraint is then a property of a rank represented as a hash-backed
set of all the words (represented as indices into a lexicon array) which
could occur in said rank. We cache these sets as they are usually small.
Adding a constraint to a hypothesis simply involves a series of in-place
set intersection operations.
In contrast, the search is primitive. There are two strategies currently
supported. One choses successors randomly. The other chooses successors according to a heuristic that we should fill more-tightly specified
ranks earlier to prevent search errors. Both strategies also rank
successors according to how specified their most-specified rank is,
with less-specified min ranks favored over more-specified ranks.
"""
__author__ = "Kyle Gorman"
# TODO(kbg): These are hard-coded for now; determine them at runtime.
N = 5
BEAM_WIDTH = 1024
CANDIDATE_SUCCESSORS = 16
# TODO(kbg): Use a better wordlist. SUBTLEX? CELEX? Other?
LEXICON = "/usr/share/dict/words"
# TODO(kbg): Make sure not to use a word twice? Otherwise there's a cheap
# strategy of using the word once horizontally and once vertically. (But
# maybe this would be interesting for clueing!)
import copy
import heapq
import logging
import random

import numpy
logging.basicConfig(level="INFO")
# NB: Both Constraint and ConstrainedLexicon are agnostic as to
# orientation (i.e., row vs. column). Only Hypothesis instances need to
# care about orientation.
class Constraint(object):
    """A horizontal or vertical constraint on successors."""

    def __init__(self, char, posish):
        self._char = char
        self._posish = posish

    # Constraints are used as cache keys in ConstrainedLexicon, so they
    # must hash and compare by value rather than by identity.
    def __eq__(self, other):
        return (self._char, self._posish) == (other._char, other._posish)

    def __hash__(self):
        return hash((self._char, self._posish))

    def satisfy(self, word):
        """Checks whether a word satisfies the constraint."""
        return word[self._posish] == self._char
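
# A minimal illustration (with hypothetical words, not drawn from the
# real lexicon): since words are bytestrings, iterating over them yields
# integers, so constraint characters are ints compared against ints.
#
#     >>> c = Constraint(ord("a"), 2)  # "the third character is 'a'"
#     >>> c.satisfy(b"toast")
#     True
#     >>> c.satisfy(b"torch")
#     False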

class ConstrainedLexicon(object):
    """The lexicon, and slices of it with constraints applied."""

    def __init__(self, lexicon):
        self._lexicon = lexicon
        # Items in this are pairs of a Constraint (key) and a set of
        # indices into the lexicon (value).
        self._constraint_to_satisfiers = {}

    # A constraint key can be used to retrieve the subset of lexical
    # entries (in the form of a set of indices) that satisfy the
    # constraint. These are then intersected with other constraints'
    # subsets. The subsets are lazily computed.
    # TODO(kbg): Possibly deallocate subsets after they're not needed
    # anymore.
    def __getitem__(self, constraint):
        satisfiers = self._constraint_to_satisfiers.get(constraint)
        if satisfiers is None:
            satisfiers = {index for (index, successor) in
                          enumerate(self._lexicon) if
                          constraint.satisfy(successor)}
            self._constraint_to_satisfiers[constraint] = satisfiers
        return satisfiers

    def get_entry(self, index):
        """Given an index into the lexicon, returns the bytestring."""
        return self._lexicon[index]
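
# A sketch of the caching behavior with a toy lexicon (again, these are
# hypothetical words): the first lookup scans the lexicon and caches the
# resulting index set; intersecting two such sets yields the indices
# satisfying both constraints at once.
#
#     >>> cl = ConstrainedLexicon([b"toast", b"torch", b"teach"])
#     >>> cl[Constraint(ord("o"), 1)]
#     {0, 1}
#     >>> cl[Constraint(ord("o"), 1)] & cl[Constraint(ord("a"), 2)]
#     {0}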

class Hypothesis(object):
    """Data structure representing a (partial) hypothesis grid."""

    def __init__(self, n):
        self._n = n
        # This part is purely for visualization. Perhaps it should only
        # be computed at the end?
        self._grid = numpy.full((n, n), b" ", dtype="S1")
        # Is a row or column empty?
        self._h_empty = {*range(n)}
        self._v_empty = {*range(n)}
        # Are there *any* constraints on the H and V dimensions?
        # TODO(kbg): Perhaps refactor to use the following facts instead:
        # self._h_constrained = len(self._v_empty) < self._n
        # self._v_constrained = len(self._h_empty) < self._n
        self._h_constrained = False
        self._v_constrained = False
        # Where we put the constraint information. Starts empty.
        self._h_satisfiers = [None for _ in range(n)]
        self._v_satisfiers = [None for _ in range(n)]

    # Stringifies the puzzle. Probably could be more elegant.
    def __str__(self):
        result = b"=" * self._n
        for row in self._grid:
            result += b"\n" + b"".join(tuple(row))
        result += b"\n"
        result += b"=" * self._n
        return result.decode("UTF-8")

    # Generator for the satisfier sets induced by each character of a
    # newly placed word.
    def _generate_satisfiers(self, word, index, constrained_lexicon):
        for (i, char) in enumerate(word):
            yield (i, constrained_lexicon[Constraint(char, index)])

    def add_word(self, word, index, is_row, constrained_lexicon):
        """Adds a new lexical item to the grid."""
        # NB: numpy.fromstring is deprecated for this purpose;
        # frombuffer gives an equivalent (read-only) view of the bytes.
        word_array = numpy.frombuffer(word, dtype="S1")
        if is_row:
            self._grid[index, :] = word_array
            self._h_empty.remove(index)
            # Creates constraints on the perpendicular dimension.
            if self._v_constrained:
                for (i, satisfiers) in self._generate_satisfiers(
                        word, index, constrained_lexicon):
                    self._v_satisfiers[i] &= satisfiers
            else:
                for (i, satisfiers) in self._generate_satisfiers(
                        word, index, constrained_lexicon):
                    # Copies, so that later in-place intersections
                    # cannot mutate the cached set.
                    self._v_satisfiers[i] = set(satisfiers)
                self._v_constrained = True
        else:
            self._grid[:, index] = word_array
            self._v_empty.remove(index)
            # Creates constraints on the perpendicular dimension.
            if self._h_constrained:
                for (i, satisfiers) in self._generate_satisfiers(
                        word, index, constrained_lexicon):
                    self._h_satisfiers[i] &= satisfiers
            else:
                for (i, satisfiers) in self._generate_satisfiers(
                        word, index, constrained_lexicon):
                    # Copies, so that later in-place intersections
                    # cannot mutate the cached set.
                    self._h_satisfiers[i] = set(satisfiers)
                self._h_constrained = True
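
    # To illustrate the propagation (a sketch; the word and lexicon are
    # hypothetical): placing b"toast" in row 2 imposes one constraint
    # per column, so column 0 may thereafter hold only words with "t" in
    # position 2, column 1 only words with "o" in position 2, and so on.
    #
    #     >>> hyp = Hypothesis(5)
    #     >>> hyp.add_word(b"toast", 2, True, cl)  # cl: ConstrainedLexicon
    #     >>> (hyp.final, 2 in hyp._h_empty)
    #     (False, False)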

    @property
    def final(self):
        """Returns whether the hypothesis is complete."""
        return not (self._h_empty or self._v_empty)

    def _h_viable(self):
        if self._h_constrained:
            return all(self._h_satisfiers[i] for i in self._h_empty)
        return True

    def _v_viable(self):
        if self._v_constrained:
            return all(self._v_satisfiers[i] for i in self._v_empty)
        return True

    @property
    def viable(self):
        """Returns whether the hypothesis is viable.

        A hypothesis is "viable" if, in every constrained dimension,
        each empty rank still has at least one satisfier.

        This method is NOT valid before the first entry is inserted,
        nor is it valid once the hypothesis is final; DO NOT call it in
        either case.
        """
        return self._h_viable() and self._v_viable()

    # The score is simply the number of satisfiers for the most
    # constrained unfilled rank in the puzzle. It is only really
    # comparable across partial hypotheses that have the same number of
    # empty ranks.
    @property
    def score(self):
        best = float("Inf")
        if self._h_constrained and self._h_empty:
            best = min(len(self._h_satisfiers[i]) for i in self._h_empty)
        if self._v_constrained and self._v_empty:
            best = min(best,
                       min(len(self._v_satisfiers[i])
                           for i in self._v_empty))
        return best
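
    # A worked example of the scoring (hypothetical counts): if the
    # unfilled rows have 12 and 40 satisfiers and the unfilled columns
    # have 7 and 100, the score is 7. The beam search below keeps the
    # highest-scoring hypotheses, i.e., those whose tightest rank is as
    # loose as possible.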

    def get_random_successor(self, constrained_lexicon):
        """Generates a successor at random."""
        assert self._h_empty or self._v_empty
        result = copy.deepcopy(self)
        candidates = []
        if self._h_constrained:
            candidates.extend((index, satisfiers, True) for
                              (index, satisfiers) in
                              enumerate(self._h_satisfiers) if
                              index in self._h_empty)
        if self._v_constrained:
            candidates.extend((index, satisfiers, False) for
                              (index, satisfiers) in
                              enumerate(self._v_satisfiers) if
                              index in self._v_empty)
        # Chooses a random candidate.
        (index, eset, orientation) = random.choice(candidates)
        word = constrained_lexicon.get_entry(random.choice(tuple(eset)))
        result.add_word(word, index, orientation, constrained_lexicon)
        return result

    # See the module-level documentation for a description of this
    # heuristic.
    def get_heuristic_successor(self, constrained_lexicon):
        """Generates a successor according to a heuristic."""
        assert self._h_empty or self._v_empty
        result = copy.deepcopy(self)
        candidates = []
        if self._h_constrained:
            candidates.extend((index, satisfiers, True) for
                              (index, satisfiers) in
                              enumerate(self._h_satisfiers) if
                              index in self._h_empty)
        if self._v_constrained:
            candidates.extend((index, satisfiers, False) for
                              (index, satisfiers) in
                              enumerate(self._v_satisfiers) if
                              index in self._v_empty)
        # Chooses the most tightly constrained row/column to move on.
        (argmin, minset, orientation) = min(
            candidates, key=lambda triple: len(triple[1]))
        word = constrained_lexicon.get_entry(random.choice(tuple(minset)))
        result.add_word(word, argmin, orientation, constrained_lexicon)
        return result
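
# A note on the selection step above (a sketch with made-up triples):
# min with a key of len(triple[1]) picks the candidate rank whose
# satisfier set is smallest.
#
#     >>> min([(0, {1, 2, 3}, True), (4, {9}, False)],
#     ...     key=lambda triple: len(triple[1]))
#     (4, {9}, False)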

# TODO(kbg): Terrible name; change it.
class Search(object):
    """Data structure for greedy search for a dense crossword puzzle."""

    # TODO(kbg): Way too much work in the constructor.
    def __init__(self, n, lexicon, beam_width):
        # Initializations.
        logging.info("N: %d", n)
        self._i = 0  # How far along we are.
        self._n = n
        logging.info("Beam width: %d", beam_width)
        self._beam = []
        self._beam_width = beam_width
        self._constrained_lexicon = ConstrainedLexicon(lexicon)
        # Adds initial hypotheses.
        logging.info("Creating initial hypotheses and constrained lexica")
        random_words = random.sample(lexicon, self._beam_width)
        random_indices = numpy.random.randint(0, self._n,
                                              self._beam_width)
        # True: row; False: column.
        random_orientations = numpy.random.rand(self._beam_width) > .5
        nonviable = 0
        for (word, index, orientation) in zip(random_words,
                                              random_indices,
                                              random_orientations):
            hyp = Hypothesis(self._n)
            # Adds constraints on the other orientation to the
            # hypothesis.
            hyp.add_word(word, index, orientation,
                         self._constrained_lexicon)
            if not hyp.viable:
                nonviable += 1
                continue
            self._beam.append(hyp)
        # Every surviving hypothesis now contains exactly one word.
        self._i += 1
        logging.info("Initialization complete")
        logging.info("Nonviable initial hypotheses: %d", nonviable)
        # Now, the normal case.
        while self._i < self._n + self._n - 1:
            successors = set()
            for hyp in self._beam:
                for _ in range(CANDIDATE_SUCCESSORS):
                    successor = hyp.get_heuristic_successor(
                        self._constrained_lexicon)
                    if not successor.viable:
                        continue
                    successors.add(successor)
            self._beam = heapq.nlargest(self._beam_width, successors,
                                        key=lambda x: x.score)
            del successors  # Hope this forces a nice garbage collection.
            logging.info("Iteration %d complete", self._i)
            self._i += 1
        # Prints out the results.
        logging.info("Final hypotheses: %d", len(self._beam))
        print()
        for hypothesis in self._beam:
            print(hypothesis)
            print()

def get_lexicon(filename, n):
    """Generates a lexicon of case-folded words of length n."""
    logging.debug("Reading lexicon from file %s", filename)
    with open(filename, "r") as source:
        for line in source:
            if not line.islower():
                continue
            line = line.rstrip()
            if len(line) == n:
                yield bytes(line.lower(), "UTF-8")
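
# Hypothetical usage (assuming /usr/share/dict/words exists and contains
# lowercase five-letter entries):
#
#     >>> lexicon = list(get_lexicon("/usr/share/dict/words", 5))
#     >>> all(len(word) == 5 for word in lexicon)
#     True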

if __name__ == "__main__":
    # Reads in the lexicon.
    lexicon = list(get_lexicon(LEXICON, N))
    # Constructs and invokes the search singleton, printing the results.
    # TODO(kbg): Weird API (shouldn't it return them? and why do so much
    # work in the constructor?) but whatever.
    Search(N, lexicon, BEAM_WIDTH)