Created March 10, 2018 16:46
A basic skip-gram model for learning vector representations of words, implemented in TensorFlow.
import numpy as np
import tensorflow as tf
from sklearn.manifold import TSNE
from sklearn import preprocessing
import matplotlib.pyplot as plt

corpus_raw = 'He is the king . The king is royal . She is the royal queen . The queen loves king'
# Convert the corpus to lower case
corpus_raw = corpus_raw.lower()
""" | |
Convert the input corpus to input output pair, | |
such that if we input a word we get N neighbouring words | |
as the output. N is also called as the window size. This | |
is known as the skip-gram model | |
""" | |
# Convert all the words to integers using dictionaries
words = []
for word in corpus_raw.split():
    # '.' is ignored in the model as it should not be treated as a word
    if word != '.':
        words.append(word)
words = set(words)  # Removal of duplicates
word2int = {}
int2word = {}
vocab_size = len(words)
for i, word in enumerate(words):
    word2int[word] = i
    int2word[i] = word
# Generate a list of sentences as lists of words
raw_sentences = corpus_raw.split('.')
sentences = []
for sentence in raw_sentences:
    sentences.append(sentence.split())
# Generate the training data
data = []
window_size = 2
# Iterate over the list of sentences [[w1, w2, w3], [w1, w2, w3], ...]
for sentence in sentences:
    # Take a sentence [w1, w2, w3, ...]
    for word_index, word in enumerate(sentence):
        # word is the center word at position word_index
        for nb_word in sentence[max(word_index - window_size, 0): min(word_index + window_size, len(sentence)) + 1]:
            # nb_word ranges over the neighbouring words within the window
            if nb_word != word:
                # Don't include the center word in its own window
                data.append([word, nb_word])
# data is a list of [center word, context word] pairs
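# e.g. the first few entries of data are
# [['he', 'is'], ['he', 'the'], ['is', 'he'], ['is', 'the'], ['is', 'king'], ...]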
# The training data must now be represented as numbers
# We use the word2int dictionary for this purpose
# Then we convert them to one-hot vectors
def to_one_hot_vectors(data_point_index, vocab_size):
    temp = np.zeros(vocab_size)
    temp[data_point_index] = 1
    return temp
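# e.g. to_one_hot_vectors(2, 5) returns array([0., 0., 1., 0., 0.])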
x_train = []
y_train = []
for data_words in data:
    # x_train consists of one-hot representations of the center words
    x_train.append(to_one_hot_vectors(word2int[data_words[0]], vocab_size))
    # y_train consists of one-hot representations of the surrounding (context) words
    y_train.append(to_one_hot_vectors(word2int[data_words[1]], vocab_size))
# Convert the final training data to numpy arrays
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
# Build the tensorflow model
# First make placeholders for x_train and y_train
x = tf.placeholder(tf.float32, shape=(None, vocab_size))
y_label = tf.placeholder(tf.float32, shape=(None, vocab_size))
# Number of dimensions used to represent each word vector
embedding_dimension = 5
# w1 = [[vector rep of word1], [vector rep of word2], ...]
w1 = tf.Variable(tf.random_normal([vocab_size, embedding_dimension]))
b1 = tf.Variable(tf.random_normal([embedding_dimension]))  # bias
# hidden_representation gives the vector representation of an input word given its one-hot encoding
hidden_representation = tf.add(tf.matmul(x, w1), b1)
w2 = tf.Variable(tf.random_normal([embedding_dimension, vocab_size]))
b2 = tf.Variable(tf.random_normal([vocab_size]))
# The final layer is a softmax which gives us probabilities which we then compare
# with the ground truth to back-propagate the gradients
prediction = tf.nn.softmax(tf.add(tf.matmul(hidden_representation, w2), b2))
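# Shape flow: x [batch, vocab_size] @ w1 [vocab_size, embedding_dimension] -> hidden [batch, embedding_dimension];
# hidden @ w2 [embedding_dimension, vocab_size] -> logits [batch, vocab_size] -> softmax probabilities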
# Now we train the model
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# We use a simple cross-entropy loss function
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), reduction_indices=[1]))
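# i.e. loss = -(1/N) * sum_i sum_j y_label[i][j] * log(prediction[i][j]) over the N training pairs,
# the mean categorical cross entropy between the predicted softmax and the one-hot target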
# Define the training step
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)
n_iterations = 10000
# Train for n_iterations passes over the full training set
for _ in range(n_iterations):
    sess.run(train_step, feed_dict={x: x_train, y_label: y_train})
    print("Loss is", sess.run(cross_entropy_loss, feed_dict={x: x_train, y_label: y_train}))
# w1 + b1 gives the vector representation of each word
vectors = sess.run(tf.add(w1, b1))
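# b1 (shape [embedding_dimension]) is broadcast across the rows of w1 (shape [vocab_size, embedding_dimension]),
# so vectors[word2int[w]] is the learnt embedding of word w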
print(vectors)
# Plot the learnt vector representations
# Reduce the embeddings to 2 dimensions with t-SNE so they can be plotted
model = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
vectors = model.fit_transform(vectors)
# Normalize the reduced vectors (unit L2 norm per row) for plotting in matplotlib
normalizer = preprocessing.Normalizer(norm='l2')
vectors = normalizer.fit_transform(vectors)
# Time to plot the word clusters!
fig, ax = plt.subplots()
for word in words:
    print(word, vectors[word2int[word]])
    # Draw a point for each word and label it with the word itself
    ax.scatter(vectors[word2int[word]][0], vectors[word2int[word]][1])
    ax.annotate(word, (vectors[word2int[word]][0], vectors[word2int[word]][1]))
plt.show()
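# Illustrative extra (a hypothetical helper, not part of the model above): find the word whose
# vector lies closest, by Euclidean distance, to a given word's vector. Here it is applied to
# the 2-D t-SNE projection in `vectors`; it could equally be applied to the full
# embedding-dimensional vectors from sess.run(tf.add(w1, b1)).
def closest_word(query, vecs):
    query_idx = word2int[query]
    distances = np.linalg.norm(vecs - vecs[query_idx], axis=1)
    distances[query_idx] = np.inf  # exclude the query word itself
    return int2word[int(np.argmin(distances))]

print("Closest to 'queen':", closest_word('queen', vectors))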