Skip to content

Instantly share code, notes, and snippets.

@zmjjmz
Created March 7, 2019 23:39
Show Gist options
  • Save zmjjmz/2ee2090233217c0d1be2dc975e5c3a47 to your computer and use it in GitHub Desktop.
TF2 Upgrade Script testing - part 1
import itertools
import numpy
import tensorflow
class TokenizeLookupLayer(tensorflow.keras.layers.Layer):
    """Tokenize strings and look the tokens up in a fixed vocabulary.

    Layer that encapsulates the following:
    - Tokenizing sentences by space (or a given delimiter)
    - Looking up the words with a given vocabulary list / table
      (out-of-vocabulary tokens map to ``oov_value``)
    - Resetting the shape of the above to be batch_size x pad_len
      by padding then slicing

    # Input Shape
        2D string tensor with shape `(batch_size, 1)`
    # Output Shape
        2D int32 tensor with shape `(batch_size, pad_len)`
    """

    def __init__(self, word_ind_map, pad_len, pad_value=0, oov_value=1,
                 delimiter=' ', **kwargs):
        """Configure the layer.

        Args:
            word_ind_map: dict mapping token string -> integer index.
            pad_len: fixed output width; longer sequences are truncated.
            pad_value: index used to pad sequences shorter than pad_len.
            oov_value: index returned for tokens absent from word_ind_map.
            delimiter: token separator handed to tf.string_split. Defaults
                to ' ', which matches the previous hard-coded behavior.
            **kwargs: forwarded to tensorflow.keras.layers.Layer.
        """
        super(TokenizeLookupLayer, self).__init__(**kwargs)
        self.input_spec = tensorflow.keras.layers.InputSpec(
            ndim=2, dtype=tensorflow.string)
        self.pad_len = pad_len
        self.pad_value = pad_value
        self.oov_value = oov_value
        self.word_ind_map = word_ind_map
        self.delimiter = delimiter

    def get_config(self):
        """Return the layer configuration for serialization."""
        config = {
            'word_ind_map': self.word_ind_map,
            'pad_len': self.pad_len,
            'pad_value': self.pad_value,
            'oov_value': self.oov_value,
            'delimiter': self.delimiter,
        }
        base_config = super(TokenizeLookupLayer, self).get_config()
        config.update(base_config)
        return config

    def build(self, input_shape):
        # NOTE(review): tensorflow.contrib was removed in TF2 and the v2
        # upgrade script does NOT rewrite this line ("UNCAUGHT"); the TF2
        # equivalent is tensorflow.lookup.StaticHashTable with
        # tensorflow.lookup.KeyValueTensorInitializer.
        self.lookup_tab = tensorflow.contrib.lookup.HashTable(
            tensorflow.contrib.lookup.KeyValueTensorInitializer(
                *zip(*self.word_ind_map.items())),
            default_value=self.oov_value)
        super(TokenizeLookupLayer, self).build(input_shape)

    def call(self, str_inp):
        # (batch_size, 1) -> (batch_size,), then split into a SparseTensor
        # of tokens. tf.string_split supports no name argument.
        tokenized_inp = tensorflow.string_split(
            tensorflow.squeeze(str_inp, axis=1),
            delimiter=self.delimiter)
        sparse_inp_lookedup = self.lookup_tab.lookup(
            tokenized_inp,
            name='lookup'
        )
        # Densifying yields width max_seq_len_in_batch, which bears no
        # relation to pad_len — so we pad then slice to force a fixed width.
        dense_inp = tensorflow.sparse_tensor_to_dense(
            sparse_inp_lookedup,
            default_value=self.pad_value,
            name='dense'
        )
        # Append pad_len pad columns so the tensor is at least pad_len wide.
        pad_full = tensorflow.pad(
            dense_inp,
            paddings=tensorflow.constant([[0, 0], [0, self.pad_len]]),
            mode='CONSTANT',
            constant_values=self.pad_value,
            name='pad'
        )
        # Then truncate the second dimension to exactly pad_len.
        sliced = pad_full[:, :self.pad_len]
        return sliced

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.pad_len,)
def test_TokenizeLookupLayer():
    """Smoke-test TokenizeLookupLayer end to end.

    Builds a one-layer Keras model over random sentences (some longer than
    pad_len, to exercise truncation) and prints the looked-up output and its
    shape for manual inspection. Requires a TF1-style Session.
    """
    # Vocabulary: 'a'..'k' -> 1..11 (0 is implicitly the pad index).
    word_ind_map = {w: ind + 1 for ind, w in enumerate('abcdefghijk')}
    n_examples = 1000
    pad_len = 162
    word_iterator = itertools.cycle(word_ind_map.keys())
    # Random-length, randomly ordered sentences of vocabulary words.
    # (The original shuffled via sorted() with a random key; use the
    # idiomatic numpy.random.shuffle instead.)
    strs = []
    for _ in range(n_examples):
        n_words = numpy.random.randint(1, high=pad_len + 100)
        words = [next(word_iterator) for _ in range(n_words)]
        numpy.random.shuffle(words)
        strs.append(' '.join(words))
    str_inp = tensorflow.keras.layers.Input(shape=(1,), dtype='string')
    looked_up = TokenizeLookupLayer(word_ind_map, pad_len)(str_inp)
    tokenize_lookup_model = tensorflow.keras.models.Model(
        inputs=str_inp, outputs=looked_up)
    compile_kwargs = {
        "optimizer": "sgd",
        "loss": "mean_squared_error",
        "metrics": []
    }
    # tokenize_lookup_model.compile(**compile_kwargs)
    with tensorflow.Session() as sess:
        # Lookup tables need explicit initialization in TF1 graph mode.
        tensorflow.tables_initializer().run(session=sess)
        strs_looked_up = sess.run(looked_up, feed_dict={
            str_inp: numpy.expand_dims(strs, axis=1)})
        print(strs_looked_up)
        print(strs_looked_up.shape)
# Run the smoke test when executed as a script (no-op on import).
if __name__ == "__main__":
    test_TokenizeLookupLayer()
TensorFlow 2.0 Upgrade Script
-----------------------------
Converted 1 files
Detected 0 issues that require attention
--------------------------------------------------------------------------------
================================================================================
Detailed log follows:
================================================================================
--------------------------------------------------------------------------------
Processing file 'tf2_upgrade_test.py'
outputting to 'tf2_upgrade_test_upgraded.py'
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment