dense crossword puzzle search
#!/usr/bin/env python
# Python 3.5 or greater, but probably backportable with minimal effort.
"""Creates dense crossword puzzle fills.

This module provides code to fill dense crossword puzzles. It is a work
in progress.

We represent the words in a puzzle as a square matrix of bytes.

The constraints on successor hypotheses are represented as properties of
the 2n "ranks" (rows or columns) rather than as constraints on possible
characters in the n^2 cells of the puzzle. We also prefer word-level
constraints to character-level constraints on the grounds that the
lexicon is much sparser than "sigma-star" (all strings over the
alphabet).

A constraint on a rank is represented as a hash-backed set of all the
words (encoded as indices into a lexicon array) which could occur in
said rank. We cache these sets as they are usually small. Adding a
constraint to a hypothesis then simply involves a series of in-place
set intersection operations.

In contrast, the search is primitive. Two strategies are currently
supported. One chooses successors randomly. The other chooses successors
according to the heuristic that more tightly specified ranks should be
filled earlier, to prevent search errors. Both strategies also score
hypotheses by the size of the satisfier set of their most-specified
rank, favoring hypotheses whose tightest rank is less specified.
"""

__author__ = "Kyle Gorman"

# TODO(kbg): These are hard-coded for now; determine them at runtime.
N = 5
BEAM_WIDTH = 1024
CANDIDATE_SUCCESSORS = 16
# TODO(kbg): Use a better wordlist. SUBTLEX? CELEX? Other?
LEXICON = "/usr/share/dict/words"
# TODO(kbg): Make sure not to use a word twice? Otherwise there's a cheap
# strategy of using the word once horizontally and once vertically. (But
# maybe this would be interesting for clueing!)

import copy
import heapq
import logging
import random

import numpy

logging.basicConfig(level="INFO")


# NB: Both Constraint and ConstrainedLexicon are agnostic as to
# orientation (i.e., row vs. column). Only Hypothesis instances need to
# care about orientation.
class Constraint(object):
    """A horizontal or vertical constraint on successors."""

    def __init__(self, char, posish):
        self._char = char
        self._posish = posish

    # Equality and hashing are by value, so that equivalent constraints
    # hit the same entry in ConstrainedLexicon's cache.

    def __eq__(self, other):
        return self._char == other._char and self._posish == other._posish

    def __hash__(self):
        return hash((self._char, self._posish))

    def satisfy(self, word):
        """Checks whether a word satisfies the constraint."""
        return word[self._posish] == self._char
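
# Example (illustrative): a constraint requiring first byte "a". Lexicon
# entries are bytestrings, so indexing one yields an integer byte value:
#
#     c = Constraint(ord("a"), 0)
#     c.satisfy(b"apple")  # True
#     c.satisfy(b"grape")  # False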

class ConstrainedLexicon(object):
    """The lexicon, and slices of it with constraints applied."""

    def __init__(self, lexicon):
        self._lexicon = lexicon
        # Maps a Constraint (key) to the set of indices into the lexicon
        # (value) that satisfy it.
        self._constraint_to_satisfiers = {}

    # A constraint key can be used to retrieve the subset of lexical
    # entries (in the form of a set of indices) that satisfy the
    # constraint. These subsets are then intersected with those of other
    # constraints. The subsets are lazily computed.
    # TODO(kbg): Possibly deallocate subsets after they're not needed
    # anymore.

    def __getitem__(self, constraint):
        # NB: Returns the cached set itself; callers must copy it before
        # mutating it.
        satisfiers = self._constraint_to_satisfiers.get(constraint)
        if satisfiers is None:
            satisfiers = {index for (index, successor) in
                          enumerate(self._lexicon) if
                          constraint.satisfy(successor)}
            self._constraint_to_satisfiers[constraint] = satisfiers
        return satisfiers

    def get_entry(self, index):
        """Given an index into the lexicon, returns the bytestring."""
        return self._lexicon[index]
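
# Example (illustrative): with the three-word lexicon below, the same
# Constraint value hits the cache on the second lookup:
#
#     cl = ConstrainedLexicon([b"tad", b"tip", b"den"])
#     cl[Constraint(ord("t"), 0)]  # {0, 1}, computed and cached.
#     cl[Constraint(ord("t"), 0)]  # {0, 1}, a cache hit.
#
# A rank subject to two constraints holds the intersection of their
# satisfier sets, e.g. {0, 1} & {1, 2} == {1}.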

class Hypothesis(object):
    """Data structure representing a (partial) hypothesis grid."""

    def __init__(self, n):
        self._n = n
        # This part is purely for visualization. Perhaps it should only
        # be computed at the end?
        self._grid = numpy.full((n, n), " ", dtype="S1")
        # Is a row or column empty?
        self._h_empty = {*range(n)}
        self._v_empty = {*range(n)}
        # Are there *any* constraints on the H and V dimensions?
        # TODO(kbg): Perhaps refactor to use the following facts instead:
        # self._h_constrained = len(self._v_empty) < self._n
        # self._v_constrained = len(self._h_empty) < self._n
        self._h_constrained = False
        self._v_constrained = False
        # Where we put the constraint information. Starts empty.
        self._h_satisfiers = [None for _ in range(n)]
        self._v_satisfiers = [None for _ in range(n)]

    # Stringifies the puzzle. Probably could be more elegant.

    def __str__(self):
        result = b"=" * self._n
        for row in self._grid:
            result += b"\n" + b"".join(tuple(row))
        result += b"\n"
        result += b"=" * self._n
        return result.decode("UTF-8")
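
    # Illustrative output for a hypothetical completed 3x3 fill (rows
    # "tad", "ire", "pen"; columns "tip", "are", "den"):
    #
    #     ===
    #     tad
    #     ire
    #     pen
    #     ===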

    # Generator for satisfiers.

    def _generate_satisfiers(self, word, index, constrained_lexicon):
        for (i, char) in enumerate(word):
            yield (i, constrained_lexicon[Constraint(char, index)])

    def add_word(self, word, index, is_row, constrained_lexicon):
        """Adds a new lexical item to the grid."""
        # NB: numpy.fromstring is deprecated; frombuffer is its
        # equivalent for bytes input.
        word_array = numpy.frombuffer(word, dtype="S1")
        if is_row:
            self._grid[index, :] = word_array
            self._h_empty.remove(index)
            # Creates constraints on the perpendicular dimension.
            if self._v_constrained:
                for (i, satisfiers) in self._generate_satisfiers(
                        word, index, constrained_lexicon):
                    self._v_satisfiers[i] &= satisfiers
            else:
                # Copies, so that the in-place intersections above never
                # mutate the sets cached in the ConstrainedLexicon.
                for (i, satisfiers) in self._generate_satisfiers(
                        word, index, constrained_lexicon):
                    self._v_satisfiers[i] = set(satisfiers)
                self._v_constrained = True
        else:
            self._grid[:, index] = word_array
            self._v_empty.remove(index)
            # Creates constraints on the perpendicular dimension.
            if self._h_constrained:
                for (i, satisfiers) in self._generate_satisfiers(
                        word, index, constrained_lexicon):
                    self._h_satisfiers[i] &= satisfiers
            else:
                # Copies, as above.
                for (i, satisfiers) in self._generate_satisfiers(
                        word, index, constrained_lexicon):
                    self._h_satisfiers[i] = set(satisfiers)
                self._h_constrained = True
    @property
    def final(self):
        """Returns whether the hypothesis is complete."""
        return not (self._h_empty or self._v_empty)

    def _h_viable(self):
        if self._h_constrained:
            return all(self._h_satisfiers[i] for i in self._h_empty)
        return True

    def _v_viable(self):
        if self._v_constrained:
            return all(self._v_satisfiers[i] for i in self._v_empty)
        return True

    @property
    def viable(self):
        """Returns whether the hypothesis is viable.

        A hypothesis is "viable" if every constrained dimension has at
        least one satisfier for each empty row/column in that dimension.

        This method is NOT valid before the first entry is inserted, nor
        is it valid when the hypothesis is final; DO NOT call it in
        either case.
        """
        return self._h_viable() and self._v_viable()

    # The score is simply the number of satisfiers for the most
    # constrained unfilled rank in the puzzle. It is only really
    # comparable across partial hypotheses that have the same number of
    # empty ranks.
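    # For example, if two empty rows have 3 and 7 satisfiers and an
    # empty column has 5, the score is 3; the beam keeps the
    # highest-scoring (i.e., least pinned-down) hypotheses.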
    @property
    def score(self):
        best = float("inf")
        if self._h_constrained and self._h_empty:
            best = min(len(self._h_satisfiers[i]) for i in self._h_empty)
        if self._v_constrained and self._v_empty:
            best = min(best,
                       min(len(self._v_satisfiers[i]) for
                           i in self._v_empty))
        return best
    def get_random_successor(self, constrained_lexicon):
        """Randomly generates a successor."""
        assert self._h_empty or self._v_empty
        result = copy.deepcopy(self)
        candidates = []
        if self._h_constrained:
            candidates.extend((index, satisfiers, True) for
                              (index, satisfiers) in
                              enumerate(self._h_satisfiers) if
                              index in self._h_empty)
        if self._v_constrained:
            candidates.extend((index, satisfiers, False) for
                              (index, satisfiers) in
                              enumerate(self._v_satisfiers) if
                              index in self._v_empty)
        # Chooses a random candidate rank and a random word for it.
        (index, eset, orientation) = random.choice(candidates)
        word = constrained_lexicon.get_entry(random.choice(tuple(eset)))
        result.add_word(word, index, orientation, constrained_lexicon)
        return result

    # See module-level documentation for a description of this heuristic.

    def get_heuristic_successor(self, constrained_lexicon):
        """Generates a successor according to the heuristic."""
        assert self._h_empty or self._v_empty
        result = copy.deepcopy(self)
        candidates = []
        if self._h_constrained:
            candidates.extend((index, satisfiers, True) for
                              (index, satisfiers) in
                              enumerate(self._h_satisfiers) if
                              index in self._h_empty)
        if self._v_constrained:
            candidates.extend((index, satisfiers, False) for
                              (index, satisfiers) in
                              enumerate(self._v_satisfiers) if
                              index in self._v_empty)
        # Chooses the most tightly constrained row/column to fill next.
        (argmin, minset, orientation) = min(candidates,
                                            key=lambda triple:
                                            len(triple[1]))
        word = constrained_lexicon.get_entry(random.choice(tuple(minset)))
        result.add_word(word, argmin, orientation, constrained_lexicon)
        return result

# TODO(kbg): Terrible name; change it.
class Search(object):
    """Data structure for greedy search for a dense crossword puzzle."""

    # TODO(kbg): Way too much work in the constructor.

    def __init__(self, n, lexicon, beam_width):
        # Initializations.
        logging.info("N: %d", n)
        self._i = 0  # Number of completed search iterations.
        self._n = n
        logging.info("Beam width: %d", beam_width)
        self._beam = []
        self._beam_width = beam_width
        self._constrained_lexicon = ConstrainedLexicon(lexicon)
        # Adds initial hypotheses.
        logging.info("Creating initial hypotheses and constrained lexica")
        random_words = random.sample(lexicon, self._beam_width)
        random_indices = numpy.random.randint(0, self._n,
                                              self._beam_width)
        # True: row, False: column.
        random_orientations = numpy.random.rand(self._beam_width) > .5
        nonviable = 0
        for (word, index, orientation) in zip(random_words,
                                              random_indices,
                                              random_orientations):
            hyp = Hypothesis(self._n)
            # Adds constraints on the other orientation to the
            # hypothesis.
            hyp.add_word(word, index, orientation,
                         self._constrained_lexicon)
            if not hyp.viable:
                nonviable += 1
                continue
            self._beam.append(hyp)
        logging.info("Initialization complete")
        logging.info("Nonviable initial hypotheses: %d", nonviable)
        # Now, the normal case: each surviving hypothesis already has one
        # filled rank, so 2n - 1 further iterations fill the remaining
        # ranks.
        while self._i < self._n + self._n - 1:
            successors = set()
            for hyp in self._beam:
                for _ in range(CANDIDATE_SUCCESSORS):
                    successor = hyp.get_heuristic_successor(
                        self._constrained_lexicon)
                    if not successor.viable:
                        continue
                    successors.add(successor)
            self._beam = heapq.nlargest(self._beam_width, successors,
                                        key=lambda x: x.score)
            del successors  # Hope this forces a nice garbage collection.
            logging.info("Iteration %d complete", self._i)
            self._i += 1
        # Prints out the result.
        logging.info("Final hypotheses: %d", len(self._beam))
        print()
        for hypothesis in self._beam:
            print(hypothesis)
            print()

def get_lexicon(filename, n):
    """Generator producing a lexicon of case-folded words of length n."""
    logging.debug("Reading lexicon from file %s", filename)
    with open(filename, "r") as source:
        for line in source:
            if not line.islower():
                continue
            line = line.rstrip()
            if len(line) == n:
                yield bytes(line.lower(), "UTF-8")

if __name__ == "__main__":
    # Reads in the lexicon.
    lexicon = list(get_lexicon(LEXICON, N))
    # Constructs and invokes the search singleton, which prints the
    # results as a side effect.
    # TODO(kbg): Weird API (shouldn't it return them? why is so much
    # work being done in the constructor?), but whatever.
    Search(N, lexicon, BEAM_WIDTH)