Skip to content

Instantly share code, notes, and snippets.

@DerekChia
Last active December 1, 2018 11:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DerekChia/e5dd085191eedce8b5d036ba0f995719 to your computer and use it in GitHub Desktop.
Save DerekChia/e5dd085191eedce8b5d036ba0f995719 to your computer and use it in GitHub Desktop.
w2v_generate_training_data_func
class word2vec:
    """Build one-hot (target, context) training pairs from a tokenised corpus.

    The hyper-parameters are read once at construction time. The vocabulary
    lookups (``v_count``, ``words_list``, ``word_index``, ``index_word``) are
    only populated by :meth:`generate_training_data`, so that method must run
    before :meth:`word2onehot` is called directly.
    """

    def __init__(self, config=None):
        """Store the hyper-parameters.

        Args:
            config: Optional mapping providing the keys ``'n'``,
                ``'learning_rate'``, ``'epochs'`` and ``'window_size'``.
                When omitted, the module-level ``settings`` mapping is used,
                which keeps the original ``word2vec()`` call working.

        Raises:
            NameError: If ``config`` is omitted and no module-level
                ``settings`` exists.
            KeyError: If a required key is missing from the mapping.
        """
        cfg = settings if config is None else config
        # NOTE(review): 'n' is presumably the embedding size - it is only
        # stored here, so confirm against the training code.
        self.n = cfg['n']
        self.lr = cfg['learning_rate']
        self.epochs = cfg['epochs']
        self.window = cfg['window_size']

    def generate_training_data(self, settings, corpus):
        """Build the vocabulary and the one-hot training pairs.

        Args:
            settings: Unused; kept so existing callers do not break. The
                window size comes from ``self.window`` set in ``__init__``.
            corpus: Sequence of sentences, each a sequence of hashable
                tokens. It is traversed twice, so a one-shot iterator will
                yield a vocabulary but no training pairs.

        Returns:
            An object-dtype ``numpy`` array with one ``[target, context]``
            entry per token occurrence, where ``target`` is the token's
            one-hot list and ``context`` is the list of one-hot lists of its
            neighbours within ``self.window`` positions.
        """
        # Vocabulary in order of first appearance (dicts preserve insertion
        # order), which fixes the index assigned to every word.
        self.words_list = list(
            dict.fromkeys(word for sentence in corpus for word in sentence)
        )
        # Number of unique words in the vocabulary.
        self.v_count = len(self.words_list)
        # Lookup tables: word -> index and index -> word.
        self.word_index = {word: i for i, word in enumerate(self.words_list)}
        self.index_word = dict(enumerate(self.words_list))

        training_data = []
        for sentence in corpus:
            sent_len = len(sentence)
            for i, word in enumerate(sentence):
                w_target = self.word2onehot(word)
                # Clamp the window to the sentence once instead of testing
                # every candidate index; window_size 2 spans up to 5 slots.
                start = max(0, i - self.window)
                stop = min(sent_len, i + self.window + 1)
                # The target word itself is never its own context (j != i).
                w_context = [
                    self.word2onehot(sentence[j])
                    for j in range(start, stop)
                    if j != i
                ]
                training_data.append([w_target, w_context])

        # The entries are ragged (a flat vector next to a variable-length
        # list of vectors). NumPy >= 1.24 refuses to build such an array
        # unless dtype=object is requested explicitly.
        return np.array(training_data, dtype=object)

    def word2onehot(self, word):
        """Return the one-hot list of length ``self.v_count`` for ``word``.

        Raises:
            KeyError: If ``word`` is not in ``self.word_index``.
        """
        word_vec = [0] * self.v_count
        # Flip the slot that corresponds to the word's vocabulary index.
        word_vec[self.word_index[word]] = 1
        return word_vec
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment