Minimal Keras examples for various purposes
# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
# Create a Keras embedding layer with an initial one-hot encoding by using the identity initializer
import tensorflow as tf
import numpy as np
# Input sequence of five tokens drawn from a vocabulary of four words.
# Let's pretend this is "hello world hello everyone else",
# where "hello" maps to 1, "world" to 0, "everyone" to 2 and "else" to 3.
a = np.array([[1, 0, 1, 2, 3]])
# Since the vector consists of five words, the input layer has a shape of 5
inp = tf.keras.layers.Input(shape=(5,))
# There are four unique words so the input dimension is 4
# We want the embedding to be one-hot, so the output dimension is _also_ 4
# The length of the input is 5, as explained above
# Using the "identity" initializer will put 1's where the word appears and 0's elsewhere
emb = tf.keras.layers.Embedding(input_dim=4, output_dim=4, input_length=5, embeddings_initializer='identity')(inp)
# The model just puts things together.
model = tf.keras.models.Model(inputs=[inp], outputs=[emb])
# A model prediction here is just running the input through the embedding layer
# With the initial weights (which are identity)
print(model.predict(a))
# Prints:
# [[[0. 1. 0. 0.] # hello
# [1. 0. 0. 0.] # world
# [0. 1. 0. 0.] # hello
# [0. 0. 1. 0.] # everyone
# [0. 0. 0. 1.]]] # else
# One-hot encoded using embedding layer!
# Note that this one-hot encoding does not stay intact during training,
# but in practice, I found that the numbers stay close to 0 and 1 respectively.
# Also, if you _need_ a true one-hot encoding, the embedding layer is probably not always appropriate.
# Good discussion: https://github.com/keras-team/keras/issues/4838#issuecomment-269138502
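# A small follow-up sketch (my addition, not part of the original example): if the
# encoding should stay exactly one-hot, the embedding layer can be frozen, and the
# same matrix can also be produced directly with tf.keras.utils.to_categorical.
frozen_emb = tf.keras.layers.Embedding(
    input_dim=4, output_dim=4, input_length=5,
    embeddings_initializer='identity', trainable=False)(inp)
frozen_model = tf.keras.models.Model(inputs=[inp], outputs=[frozen_emb])
print(frozen_model.predict(a))              # same one-hot output, weights never update
print(tf.keras.utils.to_categorical(a, 4))  # identical matrix without any layer at all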
# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
# Sequence-to-sequence prediction using an encoder-decoder network.
# Inspired by https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
import tensorflow as tf
import numpy as np
# Overfit on the same sentence for illustration purposes.
# Interestingly, it seems to require more than one training sample for this to work.
data = [
    'All work and no play makes Jack a dull boy',
    'All work and no play makes Jack a dull boy',
    'All work and no play makes Jack a dull boy',
    'All work and no play makes Jack a dull boy'
]
# Map texts to integer sequences.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(data)
sequences = np.array(tokenizer.texts_to_sequences(data))
# The target needs a trailing dimension of 1: the network below outputs one prediction
# per timestep, and sparse_categorical_crossentropy wants the labels with the same rank.
sequence_out = sequences.reshape((sequences.shape[0], sequences.shape[1], 1))
# The number of features needs to be set to the number of words + 1 since 0 is not used by the tokenizer.
num_tokens = len(tokenizer.word_index) + 1
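# For reference (a sanity check I added, not in the original): inspect the tokenizer
# mapping and the resulting array shapes before building the model.
print(tokenizer.word_index)   # {'all': 1, 'work': 2, ..., 'boy': 10}
print(sequences.shape)        # (4, 10): four sentences of ten word indices each
print(sequence_out.shape)     # (4, 10, 1): one integer label per timestep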
# Encoder with embedding layer. The output dimension of 8 is arbitrary
inp = tf.keras.layers.Input(shape=(None,))
x = tf.keras.layers.Embedding(input_dim=num_tokens, output_dim=8)(inp)
# return_state=True makes the GRU also return its final hidden state; only the state is kept.
_, state = tf.keras.layers.GRU(32, return_state=True)(x)
# Decoder
# initial_state seeds the decoder GRU with the encoder's final state, and
# return_sequences=True makes it emit an output at every timestep so the Dense
# layer below can predict a word for each position.
inp2 = tf.keras.layers.Input(shape=(None,))
x = tf.keras.layers.Embedding(input_dim=num_tokens, output_dim=8)(inp2)
x = tf.keras.layers.GRU(32, return_sequences=True)(x, initial_state=state)
outp = tf.keras.layers.Dense(num_tokens, activation='softmax')(x)
# Creating and compiling is straightforward
model = tf.keras.models.Model(inputs=[inp, inp2], outputs=[outp])
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy']
)
model.summary()
# Usually achieves 100% accuracy after about 20 epochs.
model.fit([sequences, sequences], sequence_out, batch_size=1, epochs=20, callbacks=[
    tf.keras.callbacks.TerminateOnNaN(),
    tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2),
])
# Reverse lookup from integer index back to word.
rev = {v: k for k, v in tokenizer.word_index.items()}
print(rev)
def print_prediction(sequences):
    p = model.predict([sequences, sequences])
    # p has shape (batch, timesteps, num_tokens); take the most likely token per timestep.
    # These indices map directly back to words since the targets were the raw word indices.
    for i, sentence in enumerate(np.argmax(p, axis=-1)):
        print(i, sentence)
        print(' '.join(rev.get(int(word), 'N/A') for word in sentence))
        print()
# This correctly prints the original sentences
print_prediction(sequences)
# This usually prints the original sentence too, even though we feed the same word index (8) for the whole sequence
print_prediction([[8, 8, 8, 8, 8, 8, 8, 8, 8, 8]])
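# A minimal inference sketch (my own restructuring, not from the original gist): for real
# sequence-to-sequence decoding you would split the network into an encoder model and a
# decoder model, so the decoder can be fed its own previous prediction step by step.
# Note these layers are freshly initialized; to reuse the trained weights you would keep
# references to the layers when building the training model above.
enc_in = tf.keras.layers.Input(shape=(None,))
enc_x = tf.keras.layers.Embedding(input_dim=num_tokens, output_dim=8)(enc_in)
_, enc_state = tf.keras.layers.GRU(32, return_state=True)(enc_x)
encoder_model = tf.keras.models.Model(enc_in, enc_state)

dec_in = tf.keras.layers.Input(shape=(None,))
dec_state_in = tf.keras.layers.Input(shape=(32,))
dec_x = tf.keras.layers.Embedding(input_dim=num_tokens, output_dim=8)(dec_in)
dec_seq, dec_state_out = tf.keras.layers.GRU(32, return_sequences=True, return_state=True)(
    dec_x, initial_state=dec_state_in)
dec_out = tf.keras.layers.Dense(num_tokens, activation='softmax')(dec_seq)
decoder_model = tf.keras.models.Model([dec_in, dec_state_in], [dec_out, dec_state_out])
decoder_model.summary()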
# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
# Word prediction using a recurrent neural network
import numpy as np
import tensorflow as tf
# Overfit on the same sentence for illustration purposes.
data = [
    'All work and no play makes Jack a dull boy',
]
# Map texts to integer sequences.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(data)
sequences = np.array(tokenizer.texts_to_sequences(data))
# Create input-output pairs: two input words, one output word
seq_length = 2
# The number of features needs to be set to the number of words + 1 since 0 is not used by the tokenizer.
num_words = len(tokenizer.word_index) + 1
X, y = [], []
for sequence in sequences:
    for i in range(seq_length, len(sequence)):
        X.append(sequence[i-seq_length:i])
        y.append(sequence[i])
# Make the outputs categorical (one-hot) to match the softmax output layer.
# It's also possible to skip this and use sparse_categorical_crossentropy as the loss,
# which keeps the targets as integers and can use less memory.
y = tf.keras.utils.to_categorical(y, num_classes=num_words)
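# Shape check (my addition, not in the original): eight two-word inputs, each with a
# one-hot next-word target of length num_words.
print(np.array(X).shape, y.shape)  # (8, 2) and (8, 11) for this single sentence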
# Embedding -> GRU -> Dense -> Dense
inp = tf.keras.layers.Input(shape=(seq_length,))
x = tf.keras.layers.Embedding(input_dim=num_words, output_dim=5, input_length=seq_length)(inp)
x = tf.keras.layers.GRU(16)(x)
x = tf.keras.layers.Dense(16)(x)
outp = tf.keras.layers.Dense(num_words, activation='softmax')(x)
model = tf.keras.models.Model(inputs=[inp], outputs=[outp])
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy']
)
model.summary()
# This fit gives widely different results, sometimes reaching less than 50% accuracy
# after 50 epochs and other times reaching 100% accuracy.
model.fit(np.array(X), np.array(y), batch_size=1, epochs=50, callbacks=[
    tf.keras.callbacks.TerminateOnNaN(),
    tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3),
])
# Reverse lookup from integer index back to word.
rev = {v: k for k, v in tokenizer.word_index.items()}
print(rev)
def print_prediction(sequences):
    p = model.predict(sequences)
    # p has shape (batch, num_words); the argmax per row is the predicted next-word index.
    words = np.argmax(p, axis=1)
    for i, word in enumerate(words):
        seq = ' '.join(rev.get(int(w), 'N/A') for w in sequences[i])
        print(i, 'Input:', '"{}"'.format(seq), 'Next word:', '"{}"'.format(rev.get(int(word), 'N/A')))
        print()
# Should hopefully print the correct next word for each of these sentences
# I.e. "and", "makes", "dull", "boy"
print_prediction(np.array(tokenizer.texts_to_sequences(['All work', 'no play', 'jack a', 'a dull'])))
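# A small generation sketch (my addition, not in the original): repeatedly feed the last
# seq_length words back into the model to extend a seed phrase word by word.
def generate(seed_text, n_words=5):
    words = seed_text.lower().split()
    for _ in range(n_words):
        seq = np.array(tokenizer.texts_to_sequences([' '.join(words[-seq_length:])]))
        next_idx = int(np.argmax(model.predict(seq), axis=1)[0])
        words.append(rev.get(next_idx, 'N/A'))
    return ' '.join(words)

print(generate('All work'))  # ideally continues "and no play makes jack ..."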
# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
# Use Keras functional model to train the XOR function
import keras.utils
from keras.models import Model
from keras.layers import Input, Dense
import numpy as np
# Define the functional layers
inputs = Input(shape=(2,))
dense = Dense(32, activation='relu')(inputs)
predictions = Dense(2, activation='softmax')(dense)
# Compile the model
model = Model(inputs=inputs, outputs=predictions)
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# Generate XOR training data
x_train = np.random.randint(2, size=(1000, 2))
y_train_raw = np.logical_xor(x_train[:,0], x_train[:,1]).reshape(1000, 1)
y_train = keras.utils.to_categorical(y_train_raw, num_classes=2)
x_test = np.random.randint(2, size=(10, 2))
y_test_raw = np.logical_xor(x_test[:,0], x_test[:,1]).reshape(10, 1)
y_test = keras.utils.to_categorical(y_test_raw, num_classes=2)
# Train the model, iterating on the data in batches of 32 samples
model.fit(x_train, y_train, epochs=10, batch_size=32)
# Test the model
print('Evaluation score: ', model.evaluate(x_test, y_test))
print('Input data ', x_test)
print('Predictions: ', model.predict(x_test))
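# To read the softmax output as 0/1 labels (my addition, not in the original),
# take the argmax of each prediction row and compare with the true XOR values.
print('Predicted XOR:', np.argmax(model.predict(x_test), axis=1))
print('Actual XOR:   ', y_test_raw.ravel().astype(int))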
# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
# Use Keras sequential model to train the XOR function
import keras.utils
from keras.models import Sequential
from keras.layers import Dense
import numpy as np
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=2))
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# Generate XOR training data
x_train = np.random.randint(2, size=(1000, 2))
y_train_raw = np.logical_xor(x_train[:,0], x_train[:,1]).reshape(1000, 1)
y_train = keras.utils.to_categorical(y_train_raw, num_classes=2)
x_test = np.random.randint(2, size=(10, 2))
y_test_raw = np.logical_xor(x_test[:,0], x_test[:,1]).reshape(10, 1)
y_test = keras.utils.to_categorical(y_test_raw, num_classes=2)
# Train the model, iterating on the data in batches of 32 samples
model.fit(x_train, y_train, epochs=10, batch_size=32)
# Test the model
print('Evaluation score: ', model.evaluate(x_test, y_test))
print('Input data ', x_test)
print('Predictions: ', model.predict(x_test))
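# An equivalent formulation (a sketch of my own, not part of the original): a single
# sigmoid unit trained with binary_crossentropy on the raw 0/1 labels instead of the
# one-hot pairs.
alt = Sequential()
alt.add(Dense(32, activation='relu', input_dim=2))
alt.add(Dense(1, activation='sigmoid'))
alt.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
alt.fit(x_train, y_train_raw.astype(int), epochs=10, batch_size=32)
print('Sigmoid variant predictions:', alt.predict(x_test).round().ravel())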