@zmjjmz
Created December 19, 2017 18:23
shitty lookup layer
import tensorflow
import keras


class TokenizeLookupLayer(keras.layers.Layer):
    """
    Layer that encapsulates the following:
        - Tokenizing sentences by space (or a given delimiter)
        - Looking up the words in a given vocabulary list / table
        - Resetting the shape of the above to be batch_size x pad_len (using dark magic)

    # Input Shape
        2D string tensor with shape `(batch_size, 1)`
    # Output Shape
        2D int32 tensor with shape `(batch_size, pad_len)`
    """

    def __init__(self, word_ind_map, pad_len, pad_value=0, oov_value=1, **kwargs):
        super(TokenizeLookupLayer, self).__init__(**kwargs)
        self.input_spec = keras.engine.InputSpec(ndim=2, dtype='string')
        self.pad_len = pad_len
        self.pad_value = pad_value
        self.oov_value = oov_value
        self.word_ind_map = word_ind_map

    def get_config(self):
        config = {
            'word_ind_map': self.word_ind_map,
            'pad_len': self.pad_len,
            'pad_value': self.pad_value,
            'oov_value': self.oov_value,
        }
        base_config = super(TokenizeLookupLayer, self).get_config()
        config.update(base_config)
        return config

    def build(self, input_shape):
        # Build a static word -> index hash table from the vocabulary dict;
        # words not in the table map to oov_value.
        self.lookup_tab = tensorflow.contrib.lookup.HashTable(
            tensorflow.contrib.lookup.KeyValueTensorInitializer(
                *zip(*self.word_ind_map.items())),  # .iteritems() under Python 2
            default_value=self.oov_value)
        try:
            tensorflow.tables_initializer().run(session=keras.backend.get_session())
        except tensorflow.errors.FailedPreconditionError:
            # TODO(ZJ) this is probably wrong?: DS-209
            pass
        super(TokenizeLookupLayer, self).build(input_shape)

    def call(self, str_inp):
        # string_split wants a 1D tensor; no name supported for this op?!
        tokenized_inp = tensorflow.string_split(
            tensorflow.squeeze(str_inp, axis=1))
        sparse_inp_lookedup = self.lookup_tab.lookup(
            tokenized_inp,
            name='lookup'
        )
        # This could be batch_size x max_seq_len_in_batch, and
        # max_seq_len_in_batch bears no relation to pad_len, but we need to
        # get it out in pad_len.
        dense_inp = tensorflow.sparse_tensor_to_dense(
            sparse_inp_lookedup,
            default_value=self.pad_value,
            name='dense'
        )
        # So essentially: append pad_value to the end up to pad_len...
        pad_full = tensorflow.pad(
            dense_inp,
            paddings=tensorflow.constant([[0, 0], [0, self.pad_len]]),
            mode='CONSTANT',
            constant_values=self.pad_value,
            name='pad'
        )
        # ...then limit the second dimension to pad_len.
        sliced = pad_full[:, :self.pad_len]
        return sliced

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.pad_len)
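
For reference, a minimal usage sketch, not part of the gist, assuming standalone Keras on the TF 1.x backend; the vocabulary, pad_len, and sentences below are made up for illustration:

import numpy as np

# Hypothetical vocabulary: 0 is reserved for padding, 1 for out-of-vocabulary.
word_ind_map = {'hello': 2, 'world': 3}

inp = keras.layers.Input(shape=(1,), dtype='string')
ids = TokenizeLookupLayer(word_ind_map, pad_len=5)(inp)
model = keras.models.Model(inputs=inp, outputs=ids)

# Each row is a single space-delimited sentence.
sentences = np.array([['hello world'], ['world says goodbye']], dtype=object)
print(model.predict(sentences))
# Expected shape (2, 5): [[2 3 0 0 0], [3 1 1 0 0]]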
@soaxelbrooke

Howdy! I forked this and added a regexp replace before the split, allowing for regexp-based tokenization instead of just delim-based tokenization, in case it's useful to you:

https://gist.github.com/soaxelbrooke/246959a7290313fb22be021d9c82a394
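
The rough shape of that change, as a sketch rather than the fork's exact code (tensorflow.regex_replace only exists in newer TF 1.x releases, and the pattern here is just an example):

# In call(), before string_split: pad punctuation with spaces so the
# whitespace split treats it as separate tokens.
cleaned = tensorflow.regex_replace(
    tensorflow.squeeze(str_inp, axis=1),
    pattern='([.,!?])',
    rewrite=' \\1 ')
tokenized_inp = tensorflow.string_split(cleaned)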
