Skip to content

Instantly share code, notes, and snippets.

@sherwoac
Created September 9, 2020 16:24
Show Gist options
  • Save sherwoac/2ebe44199d777070fd7bcdce308b469f to your computer and use it in GitHub Desktop.
Save sherwoac/2ebe44199d777070fd7bcdce308b469f to your computer and use it in GitHub Desktop.
Embedding clustering vs embedding size - KERAS, TSNE
import numpy as np
import tensorflow as tf
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from matplotlib import cm
def explore_embedding_size(number_of_categories, embedding_sizes):
    """Plot a 2-D t-SNE projection of an untrained embedding for each size.

    For every entry in ``embedding_sizes`` an embedding matrix is obtained
    from ``get_simple_embedding``, reduced to two components with t-SNE and
    drawn as a scatter plot in its own subplot. Each category keeps the same
    colour in every subplot.

    Args:
        number_of_categories: Number of distinct category ids to embed; also
            the number of points in each scatter plot.
        embedding_sizes: Sized sequence of embedding dimensions to compare,
            one subplot per entry.

    Note:
        The figure is shown with ``plt.show()``; nothing is returned.
        NOTE(review): scikit-learn's t-SNE presumably rejects inputs with
        fewer samples than its perplexity setting, so very small
        ``number_of_categories`` values may fail -- confirm against the
        installed version.
    """
    max_cols = 3

    fig = plt.figure(figsize=(15, 9))
    figure_title = f"Embedding size TSNEs for {number_of_categories} categories"
    # y is in figure coordinates, so 0.08 puts the title near the bottom edge.
    fig.suptitle(figure_title, fontsize=14, y=0.08)

    # Grid layout: at most ``max_cols`` columns and just enough rows to hold
    # every subplot (ceiling division), never fewer than one of each.
    n_plots = len(embedding_sizes)
    cols = max(1, min(max_cols, n_plots))
    rows = max(1, -(-n_plots // cols))

    # The colours depend only on the category count, so compute them once.
    # plt.get_cmap is used because cm.get_cmap was removed in Matplotlib 3.9.
    colors = plt.get_cmap('jet')(np.linspace(0., 1., number_of_categories))

    for i, embedding_size in enumerate(embedding_sizes):
        input_embeddings = get_simple_embedding(number_of_categories, embedding_size)
        X_embedded = TSNE(n_components=2).fit_transform(input_embeddings)
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.scatter(X_embedded[:, 0], X_embedded[:, 1], c=colors)
        ax.set_title(f"embedding size: {embedding_size}", pad=20)

    plt.tight_layout()
    plt.show()
def get_simple_embedding(number_of_categories, embedding_size):
    """Return the vectors a freshly initialised Keras Embedding gives each id.

    A one-layer ``Sequential`` model containing only an ``Embedding`` layer
    is built and asked to predict on the ids ``0 .. number_of_categories-1``.
    The layer is never trained, so the result reflects its initial weights.

    Args:
        number_of_categories: Size of the embedding vocabulary; ids
            ``0 .. number_of_categories - 1`` are looked up.
        embedding_size: Dimension of each embedding vector.

    Returns:
        A NumPy array of shape ``(number_of_categories, embedding_size)``,
        one row per category id.
    """
    columns_of_categories = 1

    model = tf.keras.Sequential()
    # ``input_length`` is deprecated in recent Keras releases and is not
    # needed: the input shape is taken from the array passed to ``predict``.
    model.add(tf.keras.layers.Embedding(number_of_categories, embedding_size))

    # Feed an explicit (N, 1) batch -- one id per sample -- instead of
    # relying on Keras to expand a 1-D array.
    input_array = np.arange(0, number_of_categories, dtype=np.int32)
    input_array = input_array.reshape(-1, columns_of_categories)

    # No compile step: the model is only used for inference.
    output_array = model.predict(input_array)

    # Reshape explicitly rather than squeeze, so the result stays 2-D even
    # when number_of_categories or embedding_size equals 1.
    return output_array.reshape(number_of_categories, embedding_size)
if __name__ == '__main__':
    # Demo run: compare embedding sizes 2**1 .. 2**(max_power_of_2 - 1)
    # for a fixed number of categories.
    number_of_categories = 5600
    max_power_of_2 = 8
    embedding_sizes = [2 ** exponent for exponent in range(1, max_power_of_2)]
    explore_embedding_size(number_of_categories, embedding_sizes)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment