regexp match lookup layer
import keras
import tensorflow
import numpy
import re
# The capturing group is important so each token can be left-padded with a space (the space is then the token splitter)
token_pattern = r"([\w']+|[,\.\?;\-\(\)])"
substitution = r" \1"
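# For illustration, the plain-Python equivalent of the in-graph regex replace
# (this is what lets tf.string_split split on whitespace):
#   re.sub(token_pattern, substitution, "dogs, cats.")  ->  " dogs ,  cats ."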
class TokenizeLookupLayer(keras.layers.Layer):
"""
Layer that encapsulates the following:
- Tokenizing sentences by space (or given delimiter)
- Looking up the words with a given vocabulary list / table
- Resetting the shape of the above to be batch_size x pad_len (using dark magic)
# Input Shape
2D string tensor with shape `(batch_size, 1)`
# Output Shape
2D int32 tensor with shape `(batch_size, pad_len)`
"""

    def __init__(self, word_ind_map, pad_len, pad_value=0, oov_value=1, **kwargs):
        super(TokenizeLookupLayer, self).__init__(**kwargs)
        self.input_spec = keras.layers.InputSpec(ndim=2, dtype="string")
        self.pad_len = pad_len
        self.pad_value = pad_value
        self.oov_value = oov_value
        self.word_ind_map = word_ind_map
        # Index 0 is reserved for padding ('__empty__') and index 1 for OOV tokens
        # ('__oov__'); the vocabulary keys are expected to map to indices 2, 3, ...
        self.mapping = tensorflow.constant(['__empty__', '__oov__', *word_ind_map.keys()])
        self.table = tensorflow.contrib.lookup.index_table_from_tensor(
            mapping=self.mapping, default_value=oov_value)

    def get_config(self):
        config = {
            "word_ind_map": self.word_ind_map,
            "pad_len": self.pad_len,
            "pad_value": self.pad_value,
            "oov_value": self.oov_value,
        }
        base_config = super(TokenizeLookupLayer, self).get_config()
        config.update(base_config)
        return config

    def build(self, input_shape):
        try:
            tensorflow.tables_initializer().run(session=keras.backend.get_session())
        except tensorflow.errors.FailedPreconditionError:
            # TODO(ZJ) this is probably wrong?: DS-209
            pass
        super(TokenizeLookupLayer, self).build(input_shape)

    def call(self, str_inp):
        # no name supported for this op?!
        replaced_inp = tensorflow.regex_replace(str_inp, token_pattern, substitution)
        tokenized_inp = tensorflow.string_split(
            tensorflow.squeeze(replaced_inp, axis=1)
        )
        sparse_inp_lookedup = self.table.lookup(tokenized_inp, name="lookup")
        # The dense tensor is batch_size x max_seq_len_in_batch,
        # and max_seq_len_in_batch bears no relation to pad_len, but we need to
        # get it out in pad_len.
        dense_inp = tensorflow.sparse_tensor_to_dense(
            sparse_inp_lookedup, default_value=self.pad_value, name="dense"
        )
        # So essentially: pad each row with pad_value up to pad_len...
        pad_full = tensorflow.pad(
            dense_inp,
            paddings=tensorflow.constant([[0, 0], [0, self.pad_len]]),
            mode="CONSTANT",
            constant_values=self.pad_value,
            name="pad",
        )
        # ...then limit the second dimension to pad_len.
        sliced = pad_full[:, : self.pad_len]
        return sliced

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.pad_len)

# Example: the vocabulary indices start at 2 because 0 is the pad index and 1 the OOV index.
layer = TokenizeLookupLayer({"cats": 2, "dogs": 3, "you're": 4, "okay": 5, ".": 6}, 10)
inp = keras.layers.Input((1,), dtype=tensorflow.string)
outp = layer(inp)
model = keras.models.Model(inputs=inp, outputs=outp)
model.predict(numpy.array(["dogs, cats and dogs dogs dogs you're okay."]))
# Result: array([[3, 1, 2, 1, 3, 3, 3, 4, 5, 6]], dtype=int32)
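
# A minimal sketch of why get_config returns plain-Python values: the layer can be
# round-tripped through its config (Keras' default Layer.from_config is just
# cls(**config)), and a saved model would be reloaded with
# custom_objects={'TokenizeLookupLayer': TokenizeLookupLayer}. Untested beyond the
# config round-trip shown here; treat it as an assumption, not a guarantee.
config = layer.get_config()
rebuilt = TokenizeLookupLayer.from_config(config)
assert rebuilt.pad_len == layer.pad_len and rebuilt.word_ind_map == layer.word_ind_map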