Created March 10, 2018 16:46
A basic skip-gram model for learning vector representations of words, implemented in TensorFlow.
import numpy as np
import tensorflow as tf
from sklearn.manifold import TSNE
from sklearn import preprocessing
import matplotlib.pyplot as plt

corpus_raw = 'He is the king . The king is royal . She is the royal queen . The queen loves king'
# Convert the corpus to lower case
corpus_raw = corpus_raw.lower()
""" | |
Convert the input corpus to input output pair, | |
such that if we input a word we get N neighbouring words | |
as the output. N is also called as the window size. This | |
is known as the skip-gram model | |
""" | |
# Convert all the words to integers using dictionaries
words = []
for word in corpus_raw.split():
    # '.' is ignored in the model as it should not be treated as a word
    if word != '.':
        words.append(word)
words = set(words)  # Removal of duplicates
word2int = {}
int2word = {}
vocab_size = len(words)
for i, word in enumerate(words):
    word2int[word] = i
    int2word[i] = word
# Generate a list of sentences as lists of words
raw_sentences = corpus_raw.split('.')
sentences = []
for sentence in raw_sentences:
    sentences.append(sentence.split())
# Generate the training data
data = []
window_size = 2
# Iterate over the list of sentences [[w1, w2, w3], [w1, w2, w3], ...]
for sentence in sentences:
    # Take a sentence [w1, w2, w3, ...]
    for word_index, word in enumerate(sentence):
        # word is the center word at position word_index
        for nb_word in sentence[max(word_index - window_size, 0): min(word_index + window_size, len(sentence)) + 1]:
            # nb_word ranges over the neighbouring words within the window
            if nb_word != word:
                # Don't include the center word in its own window
                data.append([word, nb_word])
# data is a list of [center word, context word] pairs
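# e.g. the first few entries of data are
# [['he', 'is'], ['he', 'the'], ['is', 'he'], ['is', 'the'], ['is', 'king'], ...]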
# The training data must now be represented as numbers
# We use the word2int dictionary for this purpose
# Then we convert them to one-hot vectors
def to_one_hot_vectors(data_point_index, vocab_size):
    temp = np.zeros(vocab_size)
    temp[data_point_index] = 1
    return temp
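# e.g. to_one_hot_vectors(2, 5) returns array([0., 0., 1., 0., 0.])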
x_train = []
y_train = []
for data_words in data:
    # x_train consists of one-hot representations of the center words
    x_train.append(to_one_hot_vectors(word2int[data_words[0]], vocab_size))
    # y_train consists of one-hot representations of the surrounding (context) words
    y_train.append(to_one_hot_vectors(word2int[data_words[1]], vocab_size))
# Convert the final training data to numpy arrays
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
# Build the tensorflow model
# First make placeholders for x_train and y_train
x = tf.placeholder(tf.float32, shape=(None, vocab_size))
y_label = tf.placeholder(tf.float32, shape=(None, vocab_size))
# Number of dimensions used to represent each word vector
embedding_dimension = 5
# w1 = [[vector rep of word1], [vector rep of word2], ...]
w1 = tf.Variable(tf.random_normal([vocab_size, embedding_dimension]))
b1 = tf.Variable(tf.random_normal([embedding_dimension]))  # bias
# hidden_representation gives the vector representation of an input word given its one-hot encoding
hidden_representation = tf.add(tf.matmul(x, w1), b1)
w2 = tf.Variable(tf.random_normal([embedding_dimension, vocab_size]))
b2 = tf.Variable(tf.random_normal([vocab_size]))
# The final layer is a softmax which gives us probabilities which we then compare
# with the ground truth to back-propagate the gradients
prediction = tf.nn.softmax(tf.add(tf.matmul(hidden_representation, w2), b2))
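# Shape flow: x [batch, vocab_size] @ w1 [vocab_size, embedding_dimension] -> hidden [batch, embedding_dimension];
# hidden @ w2 [embedding_dimension, vocab_size] -> logits [batch, vocab_size] -> softmax probabilities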
# Now we train the model
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# We use a simple cross-entropy loss function
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), reduction_indices=[1]))
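# i.e. loss = -(1/N) * sum_i sum_j y_label[i][j] * log(prediction[i][j]) over the N training pairs,
# the mean categorical cross entropy between the predicted softmax and the one-hot target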
# Define the training step
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)
n_iterations = 10000
# Train for n_iterations passes over the full training set
for _ in range(n_iterations):
    sess.run(train_step, feed_dict={x: x_train, y_label: y_train})
    print("Loss is", sess.run(cross_entropy_loss, feed_dict={x: x_train, y_label: y_train}))
# w1 + b1 gives the vector representation of each word
vectors = sess.run(tf.add(w1, b1))
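# b1 (shape [embedding_dimension]) is broadcast across the rows of w1 (shape [vocab_size, embedding_dimension]),
# so vectors[word2int[w]] is the learnt embedding of word w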
print(vectors)
# Plot the learnt vector representations
# Reduce the embeddings to 2 dimensions with t-SNE so they can be plotted
model = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
vectors = model.fit_transform(vectors)
# Normalize the reduced vectors (unit L2 norm per row) for plotting in matplotlib
normalizer = preprocessing.Normalizer(norm='l2')
vectors = normalizer.fit_transform(vectors)
# Time to plot the word clusters!
fig, ax = plt.subplots()
for word in words:
    print(word, vectors[word2int[word]])
    # Draw a point for each word and label it with the word itself
    ax.scatter(vectors[word2int[word]][0], vectors[word2int[word]][1])
    ax.annotate(word, (vectors[word2int[word]][0], vectors[word2int[word]][1]))
plt.show()
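# Illustrative extra (a hypothetical helper, not part of the model above): find the word whose
# vector lies closest, by Euclidean distance, to a given word's vector. Here it is applied to
# the 2-D t-SNE projection in `vectors`; it could equally be applied to the full
# embedding-dimensional vectors from sess.run(tf.add(w1, b1)).
def closest_word(query, vecs):
    query_idx = word2int[query]
    distances = np.linalg.norm(vecs - vecs[query_idx], axis=1)
    distances[query_idx] = np.inf  # exclude the query word itself
    return int2word[int(np.argmin(distances))]

print("Closest to 'queen':", closest_word('queen', vectors))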