Skip to content

Instantly share code, notes, and snippets.

@stephen-bunn
Last active October 21, 2016 01:05
Show Gist options
  • Save stephen-bunn/e768e3b21e4201e19587d67979b0edc3 to your computer and use it in GitHub Desktop.
Save stephen-bunn/e768e3b21e4201e19587d67979b0edc3 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2016 Ritashugisha
# GNUv3 License. <http://www.gnu.org/licenses/gpl-3.0.en.html>
"""
word_discovery
.. module:: word_discovery
:platform: Linux, MacOSX, Windows
:synopsis:
:created: 2016-09-29 20:40:22
:modified: 2016-10-20 21:04:46
.. moduleauthor:: Ritashugisha (ritashugisha@gmail.com)
"""
import os
import re
import abc
import sys
import enum
import time
import string
import random
class Dimension(enum.Enum):
X = 0x0
Y = 0x1
Z = 0x2
class Lexicon:
__metaclass__ = abc.ABCMeta
@abc.abstractproperty
def words(self):
raise NotImplementedError()
class DwylLexicon(Lexicon):
def __init__(self, filepath):
self._filepath = (filepath if os.path.isfile(filepath) else None)
self._words = []
def __repr__(self):
return (
'<{self.__class__.__name__} "{self._filepath}">'
).format(self=self)
@property
def filepath(self):
return self._filepath
@property
def words(self):
if len(self._words) <= 0:
self._read_words()
return self._words
def _read_words(self):
with open(self._filepath, 'rb') as f:
self._words = [_.decode('utf-8') for _ in f.read().splitlines()]
class LexiconDiscovery(object):
def __init__(self, lexicon, bounded=True):
self._lexicon = (lexicon if isinstance(lexicon, Lexicon) else None)
self._bounded = (bounded if isinstance(bounded, bool) else True)
@property
def lexicon(self):
return self._lexicon
def _build_pattern(self, vector, offset=0):
match = ''
unknown_count = 0
provided_seen = False
for (idx, i) in enumerate(vector):
if idx <= 0:
match += '^'
if not self._bounded:
match += '(?:[a-z]*)'
if i.isalpha():
if unknown_count > 0:
if not provided_seen:
match += '(?:[a-z]{0,%d})' % (unknown_count)
provided_seen = True
else:
match += '(?:[a-z]{%d})' % (unknown_count)
unknown_count = 0
else:
provided_seen = True
# NOTE: Python only named group syntax (match vector offset)
match += '(?P<_%d>%s{1})' % ((offset + idx), i)
else:
unknown_count += 1
if idx >= (len(vector) - 1):
if self._bounded:
if unknown_count > 0:
match += '(?:[a-z]{0,%d})' % (unknown_count)
else:
match += '(?:[a-z]*)'
unknown_count = 0
match += '$'
return (match, offset)
def _split_vector(self, vector):
(sub_vectors, new_vector) = ([], [])
vector_offset = 0
for (idx, i) in enumerate(vector):
if len(i) > 0:
if (
not all([len(_) == 0 for _ in new_vector]) and
len(new_vector[-1]) <= 0
):
sub_vectors.append((new_vector[:-1], vector_offset))
new_vector = ['']
vector_offset = (idx - 1)
new_vector.append(i)
if idx >= (len(vector) - 1):
sub_vectors.append((new_vector, vector_offset))
new_vector = []
vector_offset = 0
for (i, j) in [
(_, (_ + 1))
for _ in range(len(sub_vectors))
if (_ + 1) < len(sub_vectors)
]:
sub_vectors.append((
(sub_vectors[i][0] + sub_vectors[j][0]),
sub_vectors[i][-1]
))
return sub_vectors
def discover(self, vector, sub_vectors=True, merged=True):
vector_space = [(vector, 0)]
if sub_vectors:
vector_space.extend(self._split_vector(vector))
filters = [self._build_pattern(v, offset=o) for (v, o) in vector_space]
matches = []
if not merged:
matches = dict([(_, []) for (_, o) in filters])
re_filters = [(re.compile(_), o) for (_, o) in filters]
for word in self._lexicon.words:
if merged:
if any([_.match(word) for (_, o) in re_filters]):
matches.append(word)
else:
for (pattern, offset) in re_filters:
word_match = pattern.match(word)
if word_match:
start_idx = (offset + word_match.span()[0])
end_idx = (start_idx + (len(word) - 1))
matches[pattern.pattern].append((
word,
(start_idx, end_idx)
))
return matches
if __name__ == '__main__':
BOUND = 16
(EMPTY_CHAR, EMPTY_CHAR_SCALE,) = ('~', 100)
ALPHABET = (string.ascii_lowercase + (EMPTY_CHAR * EMPTY_CHAR_SCALE))
def _get_row(board, n):
return [i.replace(EMPTY_CHAR, '') for i in board[n]]
def _get_col(board, n):
return [row[n].replace(EMPTY_CHAR, '') for row in board]
def _add_word(board, dimension, vecn, word, offset=0):
if not isinstance(dimension, Dimension):
raise ValueError(("must specify valid dimension"))
range_ = (offset, (offset + len(word)),)
if (range_[1] > BOUND):
raise ValueError(("word is too long"))
if dimension == Dimension.X:
board[vecn][range_[0]:range_[1]] = word
elif dimension == Dimension.Y:
for (c, i) in enumerate(range(range_[0], range_[1])):
board[i][vecn] = word[c]
def _get_filled_vects(board):
vects = []
for i_idx in range(len(BOARD_STATE)):
vect = _get_row(BOARD_STATE, i_idx)
if not all(len(_) <= 0 for _ in vect):
vects.append((Dimension.X, i_idx, vect,))
for i_idx in range(len(BOARD_STATE[0])):
vect = _get_col(BOARD_STATE, i_idx)
if not all(len(_) <= 0 for _ in vect):
vects.append((Dimension.Y, i_idx, vect,))
return vects
random_board_state = [
[random.choice(ALPHABET) for j in range(BOUND)]
for i in range(BOUND)
]
bounded_board_state = [
[EMPTY_CHAR for j in range(BOUND)]
for i in range(BOUND)
]
_add_word(bounded_board_state, Dimension.Y, 3, 'something', offset=3)
_add_word(bounded_board_state, Dimension.X, 3, 'solar', offset=3)
_add_word(bounded_board_state, Dimension.X, 7, 'sitting', offset=1)
_add_word(bounded_board_state, Dimension.X, 2, 'no', offset=4)
_add_word(bounded_board_state, Dimension.Y, 5, 'old', offset=2)
_add_word(bounded_board_state, Dimension.Y, 7, 'guana', offset=7)
_add_word(bounded_board_state, Dimension.X, 11, 'guana', offset=3)
_add_word(bounded_board_state, Dimension.X, 9, 'annie', offset=7)
BOARD_STATE = bounded_board_state
for (i_idx, i) in enumerate(BOARD_STATE):
sys.stdout.write('{:2d} | '.format(i_idx))
for (j_idx, j) in enumerate(i):
sys.stdout.write('{} '.format(j))
sys.stdout.write('\n')
search_space = _get_filled_vects(BOARD_STATE)
l = DwylLexicon('C:/Users/r/Downloads/words.txt')
d = LexiconDiscovery(l)
results = {}
for (i_idx, (dimension, n, v)) in enumerate(search_space):
results[hash(dimension) + n] = d.discover(v, merged=False)
print('{}/{}'.format((i_idx + 1), len(search_space)))
print(results[hash(Dimension.Y) + 4])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment