# w2v_training - gist.github.com/DerekChia/b87ca38d7efa34cd4d94edf850790033 by @DerekChia
import numpy as np

# Training
# In the full implementation, training is started with:
#   w2v.train(training_data)
# where w2v is a word2vec instance and training_data has been generated
# beforehand; a runnable sketch is included at the bottom of this file.
class word2vec():
    def train(self, training_data):
        # Initialise weight matrices
        # Both w1 and w2 should be randomly initialised, but for this demo we use
        # the pre-determined arrays getW1 (shape 9x10) and getW2 (shape 10x9)
        self.w1 = np.array(getW1)
        self.w2 = np.array(getW2)
        # self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))
        # self.w2 = np.random.uniform(-1, 1, (self.n, self.v_count))

        # Cycle through each epoch
        for i in range(self.epochs):
            # Initialise loss to 0
            self.loss = 0

            # Cycle through each training sample
            # w_t = one-hot vector for the target word, w_c = one-hot vectors for the context words
            for w_t, w_c in training_data:
                # Forward pass - pass in the vector for the target word (w_t) to get:
                # 1. predicted y using softmax (y_pred)
                # 2. hidden layer (h)
                # 3. output layer before softmax (u)
                y_pred, h, u = self.forward_pass(w_t)

                # Calculate error
                # 1. For the target word, calculate the difference between y_pred and each of the context words
                # 2. Sum up these differences with np.sum to give the error for this target word
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

                # Backpropagation
                # We use SGD to backpropagate the errors and update the weights
                self.backprop(EI, h, w_t)

                # Calculate loss
                # There are 2 parts to the loss function:
                # Part 1: negative of the sum of the values of u at the indices of the context words
                # Part 2: number of context words * log of the sum over all elements of exp(u)
                # i.e. loss += -sum_{c in context} u[c] + len(w_c) * log(sum_j exp(u[j]))
                # Note: word.index(1) returns the index of the element with value 1 in the context word's one-hot vector
                # Note: u[word.index(1)] returns the value of the output layer (before softmax) at that index
                self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))
            print('Epoch:', i, "Loss:", self.loss)
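
    # After training, the learned embedding for a word is simply the corresponding
    # row of w1 (since h = w1.T . x just selects that row for a one-hot x).
    # Minimal illustrative helper - not part of the original gist; the full
    # implementation does this lookup via a word-to-index dictionary instead.
    def word_vec_by_index(self, word_index):
        # Return the n-dimensional embedding stored in row word_index of w1
        return self.w1[word_index]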
    def forward_pass(self, x):
        # x is the one-hot vector for the target word, shape 9x1
        # Multiply by the first matrix (w1) to get the hidden layer - 10x9 dot 9x1 gives 10x1
        h = np.dot(self.w1.T, x)
        # Dot product of the hidden layer with the second matrix (w2) - 9x10 dot 10x1 gives 9x1
        u = np.dot(self.w2.T, h)
        # Run u (9x1) through softmax to force each element into the range [0, 1] - output is 9x1
        y_c = self.softmax(u)
        return y_c, h, u
    def softmax(self, x):
        # Subtract the max value before exponentiating for numerical stability;
        # this shifts every element equally and does not change the result
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)
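
    # Illustrative example (values assumed, not from the original gist):
    #   softmax(np.array([2.0, 1.0, 0.1])) is approximately [0.659, 0.242, 0.099]
    # The same result is obtained with or without subtracting np.max(x), but the
    # subtraction prevents overflow when the inputs to np.exp are large.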
    def backprop(self, e, h, x):
        # https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.outer.html
        # Column vector e (EI) is the sum of the prediction errors over all context words for the current target word
        # Going backwards, take the derivative of the loss with respect to w2
        # h - shape 10x1, e - shape 9x1, dl_dw2 - shape 10x9
        dl_dw2 = np.outer(h, e)
        # ...and with respect to w1
        # x - shape 9x1 (one-hot), w2 - shape 10x9, e - shape 9x1
        # np.dot(self.w2, e.T) - shape 10x1, so dl_dw1 - shape 9x10
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))
        # Update weights using the learning rate (self.lr)
        self.w1 = self.w1 - (self.lr * dl_dw1)
        self.w2 = self.w2 - (self.lr * dl_dw2)
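
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original gist). The full implementation
# sets epochs, lr, n and v_count in __init__, defines getW1/getW2 as fixed demo
# weights, and builds training_data from a corpus; the values below are
# placeholder assumptions so this snippet can be run on its own.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    np.random.seed(0)
    v_count, n = 9, 10  # vocabulary size and embedding size used in the demo

    # Stand-ins for the pre-determined demo weights referenced in train()
    getW1 = np.random.uniform(-1, 1, (v_count, n))
    getW2 = np.random.uniform(-1, 1, (n, v_count))

    w2v = word2vec()
    w2v.epochs = 50  # assumed hyperparameters
    w2v.lr = 0.01

    # One toy training sample: a one-hot target word with two one-hot context words
    target = [0] * v_count; target[0] = 1
    context_1 = [0] * v_count; context_1[1] = 1
    context_2 = [0] * v_count; context_2[2] = 1
    training_data = [(target, [context_1, context_2])]

    w2v.train(training_data)

    # The learned 10-dimensional embedding of the word at vocabulary index 0
    print(w2v.word_vec_by_index(0))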