Last active
October 21, 2016 01:05
-
-
Save stephen-bunn/e768e3b21e4201e19587d67979b0edc3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# | |
# Copyright (c) 2016 Ritashugisha | |
# GNUv3 License. <http://www.gnu.org/licenses/gpl-3.0.en.html> | |
""" | |
word_discovery | |
.. module:: word_discovery | |
:platform: Linux, MacOSX, Windows | |
:synopsis: | |
:created: 2016-09-29 20:40:22 | |
:modified: 2016-10-20 21:04:46 | |
.. moduleauthor:: Ritashugisha (ritashugisha@gmail.com) | |
""" | |
import os | |
import re | |
import abc | |
import sys | |
import enum | |
import time | |
import string | |
import random | |
class Dimension(enum.Enum): | |
X = 0x0 | |
Y = 0x1 | |
Z = 0x2 | |
class Lexicon: | |
__metaclass__ = abc.ABCMeta | |
@abc.abstractproperty | |
def words(self): | |
raise NotImplementedError() | |
class DwylLexicon(Lexicon): | |
def __init__(self, filepath): | |
self._filepath = (filepath if os.path.isfile(filepath) else None) | |
self._words = [] | |
def __repr__(self): | |
return ( | |
'<{self.__class__.__name__} "{self._filepath}">' | |
).format(self=self) | |
@property | |
def filepath(self): | |
return self._filepath | |
@property | |
def words(self): | |
if len(self._words) <= 0: | |
self._read_words() | |
return self._words | |
def _read_words(self): | |
with open(self._filepath, 'rb') as f: | |
self._words = [_.decode('utf-8') for _ in f.read().splitlines()] | |
class LexiconDiscovery(object): | |
def __init__(self, lexicon, bounded=True): | |
self._lexicon = (lexicon if isinstance(lexicon, Lexicon) else None) | |
self._bounded = (bounded if isinstance(bounded, bool) else True) | |
@property | |
def lexicon(self): | |
return self._lexicon | |
def _build_pattern(self, vector, offset=0): | |
match = '' | |
unknown_count = 0 | |
provided_seen = False | |
for (idx, i) in enumerate(vector): | |
if idx <= 0: | |
match += '^' | |
if not self._bounded: | |
match += '(?:[a-z]*)' | |
if i.isalpha(): | |
if unknown_count > 0: | |
if not provided_seen: | |
match += '(?:[a-z]{0,%d})' % (unknown_count) | |
provided_seen = True | |
else: | |
match += '(?:[a-z]{%d})' % (unknown_count) | |
unknown_count = 0 | |
else: | |
provided_seen = True | |
# NOTE: Python only named group syntax (match vector offset) | |
match += '(?P<_%d>%s{1})' % ((offset + idx), i) | |
else: | |
unknown_count += 1 | |
if idx >= (len(vector) - 1): | |
if self._bounded: | |
if unknown_count > 0: | |
match += '(?:[a-z]{0,%d})' % (unknown_count) | |
else: | |
match += '(?:[a-z]*)' | |
unknown_count = 0 | |
match += '$' | |
return (match, offset) | |
def _split_vector(self, vector): | |
(sub_vectors, new_vector) = ([], []) | |
vector_offset = 0 | |
for (idx, i) in enumerate(vector): | |
if len(i) > 0: | |
if ( | |
not all([len(_) == 0 for _ in new_vector]) and | |
len(new_vector[-1]) <= 0 | |
): | |
sub_vectors.append((new_vector[:-1], vector_offset)) | |
new_vector = [''] | |
vector_offset = (idx - 1) | |
new_vector.append(i) | |
if idx >= (len(vector) - 1): | |
sub_vectors.append((new_vector, vector_offset)) | |
new_vector = [] | |
vector_offset = 0 | |
for (i, j) in [ | |
(_, (_ + 1)) | |
for _ in range(len(sub_vectors)) | |
if (_ + 1) < len(sub_vectors) | |
]: | |
sub_vectors.append(( | |
(sub_vectors[i][0] + sub_vectors[j][0]), | |
sub_vectors[i][-1] | |
)) | |
return sub_vectors | |
def discover(self, vector, sub_vectors=True, merged=True): | |
vector_space = [(vector, 0)] | |
if sub_vectors: | |
vector_space.extend(self._split_vector(vector)) | |
filters = [self._build_pattern(v, offset=o) for (v, o) in vector_space] | |
matches = [] | |
if not merged: | |
matches = dict([(_, []) for (_, o) in filters]) | |
re_filters = [(re.compile(_), o) for (_, o) in filters] | |
for word in self._lexicon.words: | |
if merged: | |
if any([_.match(word) for (_, o) in re_filters]): | |
matches.append(word) | |
else: | |
for (pattern, offset) in re_filters: | |
word_match = pattern.match(word) | |
if word_match: | |
start_idx = (offset + word_match.span()[0]) | |
end_idx = (start_idx + (len(word) - 1)) | |
matches[pattern.pattern].append(( | |
word, | |
(start_idx, end_idx) | |
)) | |
return matches | |
if __name__ == '__main__': | |
BOUND = 16 | |
(EMPTY_CHAR, EMPTY_CHAR_SCALE,) = ('~', 100) | |
ALPHABET = (string.ascii_lowercase + (EMPTY_CHAR * EMPTY_CHAR_SCALE)) | |
def _get_row(board, n): | |
return [i.replace(EMPTY_CHAR, '') for i in board[n]] | |
def _get_col(board, n): | |
return [row[n].replace(EMPTY_CHAR, '') for row in board] | |
def _add_word(board, dimension, vecn, word, offset=0): | |
if not isinstance(dimension, Dimension): | |
raise ValueError(("must specify valid dimension")) | |
range_ = (offset, (offset + len(word)),) | |
if (range_[1] > BOUND): | |
raise ValueError(("word is too long")) | |
if dimension == Dimension.X: | |
board[vecn][range_[0]:range_[1]] = word | |
elif dimension == Dimension.Y: | |
for (c, i) in enumerate(range(range_[0], range_[1])): | |
board[i][vecn] = word[c] | |
def _get_filled_vects(board): | |
vects = [] | |
for i_idx in range(len(BOARD_STATE)): | |
vect = _get_row(BOARD_STATE, i_idx) | |
if not all(len(_) <= 0 for _ in vect): | |
vects.append((Dimension.X, i_idx, vect,)) | |
for i_idx in range(len(BOARD_STATE[0])): | |
vect = _get_col(BOARD_STATE, i_idx) | |
if not all(len(_) <= 0 for _ in vect): | |
vects.append((Dimension.Y, i_idx, vect,)) | |
return vects | |
random_board_state = [ | |
[random.choice(ALPHABET) for j in range(BOUND)] | |
for i in range(BOUND) | |
] | |
bounded_board_state = [ | |
[EMPTY_CHAR for j in range(BOUND)] | |
for i in range(BOUND) | |
] | |
_add_word(bounded_board_state, Dimension.Y, 3, 'something', offset=3) | |
_add_word(bounded_board_state, Dimension.X, 3, 'solar', offset=3) | |
_add_word(bounded_board_state, Dimension.X, 7, 'sitting', offset=1) | |
_add_word(bounded_board_state, Dimension.X, 2, 'no', offset=4) | |
_add_word(bounded_board_state, Dimension.Y, 5, 'old', offset=2) | |
_add_word(bounded_board_state, Dimension.Y, 7, 'guana', offset=7) | |
_add_word(bounded_board_state, Dimension.X, 11, 'guana', offset=3) | |
_add_word(bounded_board_state, Dimension.X, 9, 'annie', offset=7) | |
BOARD_STATE = bounded_board_state | |
for (i_idx, i) in enumerate(BOARD_STATE): | |
sys.stdout.write('{:2d} | '.format(i_idx)) | |
for (j_idx, j) in enumerate(i): | |
sys.stdout.write('{} '.format(j)) | |
sys.stdout.write('\n') | |
search_space = _get_filled_vects(BOARD_STATE) | |
l = DwylLexicon('C:/Users/r/Downloads/words.txt') | |
d = LexiconDiscovery(l) | |
results = {} | |
for (i_idx, (dimension, n, v)) in enumerate(search_space): | |
results[hash(dimension) + n] = d.discover(v, merged=False) | |
print('{}/{}'.format((i_idx + 1), len(search_space))) | |
print(results[hash(Dimension.Y) + 4]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment