regexp match lookup layer
import keras
import tensorflow
import numpy
import re
# The capturing group is important so each token can be left-padded with a space (the space is then the token splitter)
token_pattern = r"([\w']+|[,\.\?;\-\(\)])"
substitution = r" \1"
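# For illustration, the plain-Python equivalent of the in-graph regex replace
# (this is what lets tf.string_split split on whitespace):
#   re.sub(token_pattern, substitution, "dogs, cats.")  ->  " dogs ,  cats ."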
class TokenizeLookupLayer(keras.layers.Layer):
"""
Layer that encapsulates the following:
- Tokenizing sentences by space (or given delimiter)
- Looking up the words with a given vocabulary list / table
- Resetting the shape of the above to be batch_size x pad_len (using dark magic)
# Input Shape
2D string tensor with shape `(batch_size, 1)`
# Output Shape
2D int32 tensor with shape `(batch_size, pad_len)`
"""

    def __init__(self, word_ind_map, pad_len, pad_value=0, oov_value=1, **kwargs):
        super(TokenizeLookupLayer, self).__init__(**kwargs)
        self.input_spec = keras.layers.InputSpec(ndim=2, dtype="string")
        self.pad_len = pad_len
        self.pad_value = pad_value
        self.oov_value = oov_value
        self.word_ind_map = word_ind_map
        # Index 0 is reserved for padding ('__empty__') and index 1 for OOV tokens
        # ('__oov__'); the vocabulary keys are expected to map to indices 2, 3, ...
        self.mapping = tensorflow.constant(['__empty__', '__oov__', *word_ind_map.keys()])
        self.table = tensorflow.contrib.lookup.index_table_from_tensor(
            mapping=self.mapping, default_value=oov_value)

    def get_config(self):
        config = {
            "word_ind_map": self.word_ind_map,
            "pad_len": self.pad_len,
            "pad_value": self.pad_value,
            "oov_value": self.oov_value,
        }
        base_config = super(TokenizeLookupLayer, self).get_config()
        config.update(base_config)
        return config

    def build(self, input_shape):
        try:
            tensorflow.tables_initializer().run(session=keras.backend.get_session())
        except tensorflow.errors.FailedPreconditionError:
            # TODO(ZJ) this is probably wrong?: DS-209
            pass
        super(TokenizeLookupLayer, self).build(input_shape)

    def call(self, str_inp):
        # no name supported for this op?!
        replaced_inp = tensorflow.regex_replace(str_inp, token_pattern, substitution)
        tokenized_inp = tensorflow.string_split(
            tensorflow.squeeze(replaced_inp, axis=1)
        )
        sparse_inp_lookedup = self.table.lookup(tokenized_inp, name="lookup")
        # The dense tensor is batch_size x max_seq_len_in_batch,
        # and max_seq_len_in_batch bears no relation to pad_len, but we need to
        # get it out in pad_len.
        dense_inp = tensorflow.sparse_tensor_to_dense(
            sparse_inp_lookedup, default_value=self.pad_value, name="dense"
        )
        # So essentially: pad each row with pad_value up to pad_len...
        pad_full = tensorflow.pad(
            dense_inp,
            paddings=tensorflow.constant([[0, 0], [0, self.pad_len]]),
            mode="CONSTANT",
            constant_values=self.pad_value,
            name="pad",
        )
        # ...then limit the second dimension to pad_len.
        sliced = pad_full[:, : self.pad_len]
        return sliced

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.pad_len)

# Example: the vocabulary indices start at 2 because 0 is the pad index and 1 the OOV index.
layer = TokenizeLookupLayer({"cats": 2, "dogs": 3, "you're": 4, "okay": 5, ".": 6}, 10)
inp = keras.layers.Input((1,), dtype=tensorflow.string)
outp = layer(inp)
model = keras.models.Model(inputs=inp, outputs=outp)
model.predict(numpy.array(["dogs, cats and dogs dogs dogs you're okay."]))
# Result: array([[3, 1, 2, 1, 3, 3, 3, 4, 5, 6]], dtype=int32)
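
# A minimal sketch of why get_config returns plain-Python values: the layer can be
# round-tripped through its config (Keras' default Layer.from_config is just
# cls(**config)), and a saved model would be reloaded with
# custom_objects={'TokenizeLookupLayer': TokenizeLookupLayer}. Untested beyond the
# config round-trip shown here; treat it as an assumption, not a guarantee.
config = layer.get_config()
rebuilt = TokenizeLookupLayer.from_config(config)
assert rebuilt.pad_len == layer.pad_len and rebuilt.word_ind_map == layer.word_ind_map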