import collections
import math
import random
from itertools import compress

import numpy as np
import tensorflow as tf

# `data`, `vocabulary_size`, and `reverse_dictionary` are assumed to come from the
# usual text8 preprocessing step (build_dataset) that precedes this snippet.

data_index = 0

def generate_batch_cbow(batch_size, num_skips, skip_window):
    '''
    Batch generator for CBOW (Continuous Bag of Words).
    `batch` has shape (batch_size, num_skips).

    Parameters
    ----------
    batch_size: number of target words in each mini-batch
    num_skips: total number of context words per target, taken from both sides
        (2: one word before and one word after the target)
    skip_window: number of context words on each side of the target
        (1: one word to the left and one to the right of the target)
    '''
    global data_index
    assert batch_size % num_skips == 0
    # compress(buffer, mask) below yields 2 * skip_window context words,
    # so num_skips is expected to equal 2 * skip_window here.
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size, num_skips), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)  # sliding window over data[data_index]
    # Collect the first window of words.
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Move the sliding window one word at a time.
    for i in range(batch_size):
        mask = [1] * span
        mask[skip_window] = 0
        batch[i, :] = list(compress(buffer, mask))  # all surrounding words
        labels[i, 0] = buffer[skip_window]          # the word at the center
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
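# Optional sanity check (illustrative sketch; assumes `data` and `reverse_dictionary`
# from the preprocessing step): print a few (context -> target) pairs, then reset
# data_index so training starts from the beginning of the corpus.
sample_batch, sample_labels = generate_batch_cbow(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    context_words = [reverse_dictionary[idx] for idx in sample_batch[i]]
    target_word = reverse_dictionary[sample_labels[i, 0]]
    print(context_words, '->', target_word)
data_index = 0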
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many context words to use per target (both sides combined).
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64    # Number of negative examples to sample.
graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):
    # Input data.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size, num_skips])  # num_skips context words per example (second dimension added for CBOW)
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Variables.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Model.
    # Look up embeddings for the context words and sum them into a single
    # context vector per example (the CBOW input representation).
    embed = tf.zeros([batch_size, embedding_size])
    for j in range(num_skips):
        embed += tf.nn.embedding_lookup(embeddings, train_dataset[:, j])

    # Compute the softmax loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=embed,
                                   labels=train_labels, num_sampled=num_sampled,
                                   num_classes=vocabulary_size))

    # Optimizer.
    # Note: the optimizer will optimize the softmax_weights AND the embeddings.
    # This is because the embeddings are defined as a variable quantity and the
    # optimizer's `minimize` method will by default modify all variable quantities
    # that contribute to the tensor it is passed.
    # See docs on `tf.train.Optimizer.minimize()` for more details.
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    # Compute the similarity between minibatch examples and all embeddings.
    # We use the cosine distance:
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

num_steps = 100001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    average_loss = 0
    for step in range(num_steps):
        batch_data, batch_labels = generate_batch_cbow(batch_size, num_skips, skip_window)
        feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
        if step % 2000 == 0:
            if step > 0:
                average_loss = average_loss / 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0
        # Note that this is expensive (~20% slowdown if computed every 500 steps).
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log = '%s %s,' % (log, close_word)
                print(log)
    final_embeddings = normalized_embeddings.eval()
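# Optional: query nearest neighbours of a word directly from the trained, normalised
# embeddings. A small sketch, assuming a `dictionary` (word -> id) mapping comes from
# the same preprocessing step that produced `data` and `reverse_dictionary`.
def nearest_words(word, top_k=8):
    word_id = dictionary[word]
    sims = final_embeddings @ final_embeddings[word_id]  # cosine similarity (rows are unit-norm)
    neighbour_ids = (-sims).argsort()[1:top_k + 1]       # skip the word itself
    return [reverse_dictionary[i] for i in neighbour_ids]

print(nearest_words(reverse_dictionary[valid_examples[0]]))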