This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import time | |
import numpy as np | |
from collections import defaultdict, OrderedDict | |
from multiprocessing import Pool | |
import tensorflow as tf | |
from tensorflow import keras | |
from tensorflow.keras import layers | |
import tensorflow_datasets as tfds | |
import string |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the IMDB reviews dataset via TensorFlow Datasets; with_info=False
# means only the dict of splits is returned, not a (dataset, info) tuple.
data = tfds.load("imdb_reviews", with_info=False)
train_data, test_data = data['train'], data['test']
# Expose both splits as iterables of NumPy-backed records.
# NOTE(review): each record is presumably a dict with a bytes "text" field
# (that is what clean_sentence reads) -- confirm against the dataset schema.
np_train_data = tfds.as_numpy(train_data)
np_test_data = tfds.as_numpy(test_data)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def clean_sentence(dic):
    """Convert one raw record into a list of lemmatized, lower-cased words.

    The record's ``"text"`` bytes are decoded as UTF-8, lower-cased and run
    through the module-level ``EXCLUDE_TRANS`` translation table. The result
    is split on whitespace; tokens that are not purely alphabetic are
    dropped, and every remaining token is lemmatized with ``LEMMA`` using the
    part of speech looked up in ``POS_DIC``.

    Args:
        dic: Mapping with a ``"text"`` entry holding UTF-8 encoded bytes.

    Returns:
        list: The lemmatized tokens, in their original order.
    """
    # NOTE(review): EXCLUDE_TRANS, LEMMA, POS_DIC and pos_tag are defined
    # outside this block; EXCLUDE_TRANS presumably strips punctuation and
    # pos_tag is presumably nltk.pos_tag -- confirm.
    text = dic["text"].decode("utf-8").lower().translate(EXCLUDE_TRANS)

    words = []
    for token in text.split():
        if not token.isalpha():
            continue
        # Each token is tagged on its own (a one-element list), so the tag
        # carries no sentence context; [0][1] picks the tag of that token.
        tag = pos_tag([token])[0][1]
        words.append(LEMMA.lemmatize(token, POS_DIC[tag]))
    return words
# Number of worker processes handed to multiprocessing.Pool in clean_batch.
NUM_WORKERS = 20
def clean_batch(batch_rows): | |
with Pool(processes=NUM_WORKERS) as pool: | |
batch = pool.map(clean_sentence, batch_rows, chunksize=200) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Lower / upper percentile bounds.
# NOTE(review): presumably used to trim the vocabulary by word frequency --
# confirm against the full body of build_vocab.
PERCENT_L = 10
PERCENT_U = 90
# Set to the current working directory at import time.
# NOTE(review): presumably where logs / saved artifacts go -- confirm.
LOG_DIR = os.getcwd()
# When False, build_vocab starts from a fresh, empty word-count dict.
WARM_START = False
def build_vocab(data): | |
if not WARM_START: | |
dic = defaultdict(int) | |
for sent in data: | |
for word in sent: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Passed as window_size= to keras.preprocessing.sequence.skipgrams in
# process_sent.
WINDOW_SIZE = 5
# Passed as negative_samples= to the same skipgrams call.
NEGATIVE_SAMPLES = 10
def process_sent(sent): | |
sampling = keras.preprocessing.sequence.make_sampling_table(VOCAB_SIZE) | |
couples, labels = keras.preprocessing.sequence.skipgrams( | |
sequence=sent, vocabulary_size=VOCAB_SIZE, window_size=WINDOW_SIZE, | |
negative_samples=NEGATIVE_SAMPLES, shuffle=True, sampling_table=sampling | |
) | |
target, context = zip(*couples) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def next_batch(train_input, training=True): | |
target_data = np.hstack(train_input[:, 0]).astype(np.float32) | |
context_data = np.hstack(train_input[:, 1]).astype(np.float32) | |
label_data = np.hstack(train_input[:, 2]).astype(np.float32) | |
word_size = target_data.size // BATCH_SIZE * BATCH_SIZE | |
epoch = 1 | |
counter = 0 | |
while True: | |
t_batch = target_data[counter:counter + BATCH_SIZE] | |
c_batch = context_data[counter:counter + BATCH_SIZE] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class MyModel(tf.keras.Model): | |
def __init__(self, vocab_size, embed_size): | |
super(MyModel, self).__init__() | |
self.target_inputs = layers.Input((1,)) | |
self.context_inputs = layers.Input((1,)) | |
self.embedding = layers.Embedding( | |
vocab_size, | |
embed_size, | |
embeddings_initializer=tf.keras.initializers.glorot_normal(), | |
name='embedding') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def evaluate(model, loss_fn, target_data, context_data, label_data):
    """Compute the loss of ``model`` on a single batch.

    Only a forward pass and a loss evaluation are performed; no gradients
    are taken and no optimizer is involved.

    Args:
        model: Callable invoked as ``model(target_data, context_data)``.
        loss_fn: Callable invoked as ``loss_fn(label_data, predictions)``.
        target_data: First input forwarded to ``model``.
        context_data: Second input forwarded to ``model``.
        label_data: Ground-truth labels, passed as the first argument of
            ``loss_fn``.

    Returns:
        Whatever ``loss_fn`` returns for this batch.
    """
    predictions = model(target_data, context_data)
    return loss_fn(label_data, predictions)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE(review): presumably the number of most-similar words reported per
# example word -- its use is not visible here; confirm.
NUM_SIM = 5
def get_similarity(sim_examples, embed_weights):
    """Return cosine similarities between selected rows and all rows.

    Every row of ``embed_weights`` is scaled to unit L2 length; the rows
    selected by ``sim_examples`` are then multiplied against the transposed
    normalized matrix, so each output entry is the cosine similarity between
    one selected row and one row of the full matrix.

    Args:
        sim_examples: Integer indices of the rows to compare, as accepted by
            ``tf.nn.embedding_lookup``.
        embed_weights: Rank-2 embedding matrix, one embedding per row.

    Returns:
        Tensor of similarities; for 1-D ``sim_examples`` its shape is
        ``(len(sim_examples), number of rows in embed_weights)``.
    """
    # l2_normalize clamps the squared norm at a small epsilon, so an all-zero
    # embedding row yields zeros instead of the NaNs a plain division by the
    # norm would produce.
    norm_embed_matrix = tf.math.l2_normalize(embed_weights, axis=1)
    valid_embed = tf.nn.embedding_lookup(norm_embed_matrix, sim_examples)
    # transpose_b=True multiplies by the transposed matrix without an
    # explicit tf.transpose call.
    return tf.matmul(valid_embed, norm_embed_matrix, transpose_b=True)
def print_eval(valid_examples, sim_matrix, reverse_dic): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@tf.function | |
def train_step(model, loss_fn, optimizer, target, context, label): | |
with tf.GradientTape() as tape: | |
predictions = model(target, context) | |
batch_loss = loss_fn(label, predictions) | |
gradients = tape.gradient(batch_loss, model.trainable_variables) | |
c_gradients = [tf.clip_by_value(g, -5., 5.) for g in gradients if g is not None] | |
optimizer.apply_gradients(zip(c_gradients, model.trainable_variables)) | |
g2 = 0 | |
for g in c_gradients: |
OlderNewer