@shivamsaboo17
Created March 10, 2018 16:46
A basic skip-gram model for vector representations of words, implemented in TensorFlow.
import numpy as np
import tensorflow as tf
from sklearn.manifold import TSNE
from sklearn import preprocessing
import matplotlib.pyplot as plt
corpus_raw = 'He is the king . The king is royal . She is the royal queen . The queen loves king'
# Convert the corpus to lower case
corpus_raw = corpus_raw.lower()
"""
Convert the input corpus to input-output pairs,
such that when we input a word we get its N neighbouring words
as the output. N is also called the window size. This
is known as the skip-gram model.
"""
# Convert all the words to integers using dictionaries
words = []
for word in corpus_raw.split():
    # . is ignored in the model as it should not be treated as a word
    if word != '.':
        words.append(word)
words = set(words) # Removal of duplicates
word2int = {}
int2word = {}
vocab_size = len(words)
for i, word in enumerate(words):
    word2int[word] = i
    int2word[i] = word
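# For example, word2int['queen'] might map to 3 (the exact ids are arbitrary,
# since a Python set is unordered), and int2word[3] would then map back to 'queen'.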
# Generate list of sentences as list of words
raw_sentences = corpus_raw.split('.')
sentences = []
for sentence in raw_sentences:
    sentences.append(sentence.split())
# Generate the training data
data = []
window_size = 2
# sentences is a list of sentences: [[w1, w2, w3], [w1, w2, w3], ...]
for sentence in sentences:
    # Take a sentence [w1, w2, w3..]
    for word_index, word in enumerate(sentence):
        # word_index, word = (i, wi)
        for nb_word in sentence[max(word_index - window_size, 0): min(word_index + window_size, len(sentence)) + 1]:
            # word is the center word; nb_word iterates over the words in its window
            if nb_word != word:
                # Don't take the center word in its own window
                data.append([word, nb_word])
# data is in the form [[center word, context word], ...]
# The training data must now be represented as numbers
# We use the word2int dictionary for this purpose
# Then we convert them to one hot vectors
def to_one_hot_vectors(data_point_index, vocab_size):
    temp = np.zeros(vocab_size)
    temp[data_point_index] = 1
    return temp
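# Quick sanity check (the corpus above has 8 unique words, so vocab_size == 8;
# the id 3 for 'queen' below is just an assumed example, since set order is arbitrary):
# to_one_hot_vectors(3, 8) -> array([0., 0., 0., 1., 0., 0., 0., 0.])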
x_train = []
y_train = []
for data_words in data:
    # The x_train consists of one-hot representations of the center word
    x_train.append(to_one_hot_vectors(word2int[data_words[0]], vocab_size))
    # The y_train consists of one-hot representations of the surrounding words for the center word
    y_train.append(to_one_hot_vectors(word2int[data_words[1]], vocab_size))
# Convert the final training data to numpy array
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
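# Both arrays have shape (len(data), vocab_size): one row per (center, context) pair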
# Make the tensorflow model
# First make placeholders for x_train and y_train
x = tf.placeholder(tf.float32, shape=(None, vocab_size))
y_label = tf.placeholder(tf.float32, shape=(None, vocab_size))
# Number of dimensions we use to represent the word vector
embedding_dimension = 5
# w1 = [[vector rep of word1], [vector rep of word2]...]
w1 = tf.Variable(tf.random_normal([vocab_size, embedding_dimension]))
b1 = tf.Variable(tf.random_normal([embedding_dimension])) # bias
# The hidden_representation gives the vector representation of an input word given its one-hot encoding
hidden_representation = tf.add(tf.matmul(x, w1), b1)
w2 = tf.Variable(tf.random_normal([embedding_dimension, vocab_size]))
b2 = tf.Variable(tf.random_normal([vocab_size]))
# The final layer is a softmax which gives us the probabilities which we then compare
# with the ground truth to back-propagate the gradients
prediction = tf.nn.softmax(tf.add(tf.matmul(hidden_representation, w2), b2))
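# Shapes for reference: x is (batch, vocab_size), hidden_representation is
# (batch, embedding_dimension), prediction is (batch, vocab_size)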
# Now we train the model
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# We use a simple cross entropy loss function
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), reduction_indices=[1]))
# Define the training steps
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)
n_iterations = 10000
# Train for 10000 epochs
for _ in range(n_iterations):
    sess.run(train_step, feed_dict={x: x_train, y_label: y_train})
    print("Loss is", sess.run(cross_entropy_loss, feed_dict={x: x_train, y_label: y_train}))
# The w1 + bias is the vector representation of each word
vectors = sess.run(tf.add(w1, b1))
print(vectors)
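# Optional sketch (not part of the original gist): once the embeddings are learnt,
# a simple Euclidean nearest-neighbour lookup shows which words the model places
# close together.
def euclidean_dist(vec1, vec2):
    # Straight-line distance between two embedding vectors
    return np.sqrt(np.sum((vec1 - vec2) ** 2))

def find_closest(word_index, vectors):
    # Return the index of the nearest vector that is not the query itself
    min_dist = float('inf')
    min_index = -1
    query_vector = vectors[word_index]
    for index, vector in enumerate(vectors):
        if index != word_index and euclidean_dist(vector, query_vector) < min_dist:
            min_dist = euclidean_dist(vector, query_vector)
            min_index = index
    return min_index

# e.g. print the word whose vector is closest to 'king'
print(int2word[find_closest(word2int['king'], vectors)])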
# Plotting the learnt vector representations
model = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
vectors = model.fit_transform(vectors)
# Normalize the reduced vectors (L2 norm) for plotting in matplotlib
normalizer = preprocessing.Normalizer(norm='l2')
vectors = normalizer.fit_transform(vectors)
# Time to print word clusters!!
fig, ax = plt.subplots()
for word in words:
    print(word, vectors[word2int[word]][0], vectors[word2int[word]][1])
    ax.annotate(word, (vectors[word2int[word]][0], vectors[word2int[word]][1]))
# Annotations alone do not autoscale the axes, so set the limits explicitly
ax.set_xlim(vectors[:, 0].min() - 0.5, vectors[:, 0].max() + 0.5)
ax.set_ylim(vectors[:, 1].min() - 0.5, vectors[:, 1].max() + 0.5)
plt.show()