VXU1230 / s1.py
Import libraries, Create tools for data cleaning, Define parameters
import os
import time
import numpy as np
from collections import defaultdict, OrderedDict
from multiprocessing import Pool
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
import string
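The preview ends at the imports, but the cleaning helpers used by clean_sentence below (EXCLUDE_TRANS, LEMMA, POS_DIC) and some training parameters are presumably defined in this file as the caption says. A minimal sketch of what they might look like, built on NLTK's pos_tag and WordNetLemmatizer; the tag mapping and the parameter values are assumptions, not taken from the gist:

from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Translation table that strips punctuation from raw review text.
EXCLUDE_TRANS = str.maketrans("", "", string.punctuation)
LEMMA = WordNetLemmatizer()
# Map Penn Treebank tags (as returned by pos_tag) to WordNet POS tags,
# falling back to NOUN for anything unmapped.
POS_DIC = defaultdict(lambda: wordnet.NOUN)
POS_DIC.update({"JJ": wordnet.ADJ, "JJR": wordnet.ADJ, "JJS": wordnet.ADJ,
                "VB": wordnet.VERB, "VBD": wordnet.VERB, "VBG": wordnet.VERB,
                "VBN": wordnet.VERB, "VBP": wordnet.VERB, "VBZ": wordnet.VERB,
                "RB": wordnet.ADV, "RBR": wordnet.ADV, "RBS": wordnet.ADV})

# Illustrative training parameters (the gist's actual values are not shown).
BATCH_SIZE = 256
EMBED_SIZE = 128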
VXU1230 / s2.py
data = tfds.load("imdb_reviews", with_info=False)
train_data, test_data = data['train'], data['test']
np_train_data = tfds.as_numpy(train_data)
np_test_data = tfds.as_numpy(test_data)
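Each element produced by tfds.as_numpy is a dict with a byte-string "text" field and an integer "label" field, which is why clean_sentence below reads dic["text"]. A small hedged example of materializing the iterators; the names train_rows/test_rows are illustrative, not the gist's:

# Collect the numpy examples so they can later be handed to the
# multiprocessing cleaner; each row looks like {"text": b"...", "label": 0 or 1}.
train_rows = list(np_train_data)
test_rows = list(np_test_data)
print(train_rows[0]["text"][:80], train_rows[0]["label"])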
VXU1230 / s3.py
def clean_sentence(dic):
    # Decode bytes, lowercase, strip punctuation, then lemmatize each
    # alphabetic token according to its part-of-speech tag.
    sent = dic["text"].decode("utf-8").lower().translate(EXCLUDE_TRANS)
    words = [LEMMA.lemmatize(word, POS_DIC[pos_tag([word])[0][1]])
             for word in sent.split() if word.isalpha()]
    return words

NUM_WORKERS = 20

def clean_batch(batch_rows):
    # Clean a batch of reviews in parallel across worker processes.
    with Pool(processes=NUM_WORKERS) as pool:
        batch = pool.map(clean_sentence, batch_rows, chunksize=200)
    return batch
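A hedged usage sketch, assuming the corpora are cleaned by mapping clean_batch over the materialized rows from the loading step (train_rows/test_rows are the illustrative names introduced above); since this spawns worker processes, the call belongs under a __main__ guard when run as a script:

if __name__ == "__main__":
    train_words = clean_batch(train_rows)   # list of lists of lemmatized tokens
    test_words = clean_batch(test_rows)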
VXU1230 / s4.py
Create vocabulary
PERCENT_L = 10
PERCENT_U = 90
LOG_DIR = os.getcwd()
WARM_START = False
def build_vocab(data):
    # Count how often each word appears in the cleaned corpus (skipped
    # when restoring a previously built vocabulary).
    if not WARM_START:
        dic = defaultdict(int)
        for sent in data:
            for word in sent:
                dic[word] += 1
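        # Hypothetical continuation (the preview cuts off above): keep words
        # whose counts fall between the PERCENT_L and PERCENT_U frequency
        # percentiles, then assign integer ids. A sketch of one reasonable
        # implementation, not necessarily the gist's; the WARM_START branch
        # (e.g. reloading a saved vocabulary from LOG_DIR) is omitted.
        counts = np.array(list(dic.values()))
        low = np.percentile(counts, PERCENT_L)
        high = np.percentile(counts, PERCENT_U)
        word_dic = {}
        for word, count in dic.items():
            if low <= count <= high:
                word_dic[word] = len(word_dic) + 1   # id 0 left for padding/unknown
        reverse_dic = {i: w for w, i in word_dic.items()}
        return word_dic, reverse_dic

# VOCAB_SIZE, used by the skip-gram code below, would then be len(word_dic) + 1.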
VXU1230 / s5.py
WINDOW_SIZE = 5
NEGATIVE_SAMPLES = 10
def process_sent(sent):
    # Frequency-based sampling table down-weights very common word ids.
    sampling = keras.preprocessing.sequence.make_sampling_table(VOCAB_SIZE)
    # Generate (target, context) skip-gram pairs plus negative samples.
    couples, labels = keras.preprocessing.sequence.skipgrams(
        sequence=sent, vocabulary_size=VOCAB_SIZE, window_size=WINDOW_SIZE,
        negative_samples=NEGATIVE_SAMPLES, shuffle=True, sampling_table=sampling
    )
    target, context = zip(*couples)
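    # Hypothetical continuation: return the pairs and labels as arrays so the
    # batching generator below can stack them per sentence. A sketch, not
    # necessarily the gist's exact return shape.
    return np.array(target), np.array(context), np.array(labels)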
VXU1230 / s6.py
Create Data Pipeline
def next_batch(train_input, training=True):
    # Flatten the per-sentence (target, context, label) arrays into one long
    # vector each, then yield fixed-size batches indefinitely.
    target_data = np.hstack(train_input[:, 0]).astype(np.float32)
    context_data = np.hstack(train_input[:, 1]).astype(np.float32)
    label_data = np.hstack(train_input[:, 2]).astype(np.float32)
    # Largest multiple of BATCH_SIZE that fits in the data.
    word_size = target_data.size // BATCH_SIZE * BATCH_SIZE
    epoch = 1
    counter = 0
    while True:
        t_batch = target_data[counter:counter + BATCH_SIZE]
        c_batch = context_data[counter:counter + BATCH_SIZE]
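        l_batch = label_data[counter:counter + BATCH_SIZE]
        # Hypothetical continuation of the generator (the preview cuts off
        # above): emit the batch, then advance and wrap around, counting
        # epochs. A sketch, not necessarily the gist's exact logic.
        yield t_batch, c_batch, l_batch, epoch
        counter += BATCH_SIZE
        if counter >= word_size:
            counter = 0
            epoch += 1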
VXU1230 / s7.py
Build model
class MyModel(tf.keras.Model):
    def __init__(self, vocab_size, embed_size):
        super(MyModel, self).__init__()
        self.target_inputs = layers.Input((1,))
        self.context_inputs = layers.Input((1,))
        # Embedding matrix used to look up word vectors.
        self.embedding = layers.Embedding(
            vocab_size,
            embed_size,
            embeddings_initializer=tf.keras.initializers.glorot_normal(),
            name='embedding')
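        # Hypothetical continuation (the preview cuts off above): score a
        # (target, context) pair by the dot product of their embeddings,
        # squashed to a probability for the binary real-vs-negative label.
        # A sketch, not necessarily the gist's exact architecture.
        self.dot = layers.Dot(axes=-1)

    def call(self, target, context):
        target_embed = self.embedding(target)
        context_embed = self.embedding(context)
        similarity = self.dot([target_embed, context_embed])
        return tf.sigmoid(similarity)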
VXU1230 / s9.py
Evaluate Model
def evaluate(model, loss_fn, target_data, context_data, label_data):
    predictions = model(target_data, context_data)
    batch_loss = loss_fn(label_data, predictions)
    return batch_loss
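A hedged usage example, assuming a sigmoid-output model trained with binary cross-entropy; test_input (the processed test pairs) and the other names mirror the training pipeline above and are illustrative:

loss_fn = tf.keras.losses.BinaryCrossentropy()
model = MyModel(VOCAB_SIZE, EMBED_SIZE)
t_batch, c_batch, l_batch, _ = next(next_batch(test_input, training=False))
val_loss = evaluate(model, loss_fn, t_batch, c_batch, l_batch)
print("validation loss:", float(val_loss))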
VXU1230 / s10.py
Calculate Word Similarity
NUM_SIM = 5
def get_similarity(sim_examples, embed_weights):
    # L2-normalize the embeddings so dot products become cosine similarities,
    # then compare the selected example words against the whole vocabulary.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embed_weights), 1, keepdims=True))
    norm_embed_matrix = embed_weights / norm
    valid_embed = tf.nn.embedding_lookup(norm_embed_matrix, sim_examples)
    sim_matrix = tf.matmul(valid_embed, tf.transpose(norm_embed_matrix))
    return sim_matrix

def print_eval(valid_examples, sim_matrix, reverse_dic):
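    # Hypothetical body (the preview ends at the signature): for each example
    # word, print its NUM_SIM nearest neighbours by cosine similarity.
    # A sketch, not necessarily the gist's exact formatting.
    sims = sim_matrix.numpy() if hasattr(sim_matrix, "numpy") else np.asarray(sim_matrix)
    for i, word_id in enumerate(valid_examples):
        nearest = np.argsort(-sims[i])[1:NUM_SIM + 1]   # index 0 is the word itself
        neighbours = ", ".join(reverse_dic.get(int(n), "<unk>") for n in nearest)
        print("Nearest to {}: {}".format(reverse_dic.get(int(word_id), "<unk>"), neighbours))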
VXU1230 / s8.py
Train a batch
@tf.function
def train_step(model, loss_fn, optimizer, target, context, label):
    with tf.GradientTape() as tape:
        predictions = model(target, context)
        batch_loss = loss_fn(label, predictions)
    gradients = tape.gradient(batch_loss, model.trainable_variables)
    # Clip each gradient while keeping it paired with its variable, so the
    # pairing stays aligned even if some gradients are None.
    clipped = [(tf.clip_by_value(g, -5., 5.), v)
               for g, v in zip(gradients, model.trainable_variables)
               if g is not None]
    optimizer.apply_gradients(clipped)
    c_gradients = [g for g, _ in clipped]
    g2 = 0
    for g in c_gradients:
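        # Hypothetical continuation: accumulate the squared global gradient
        # norm (handy for monitoring) and return it alongside the batch loss.
        g2 += tf.reduce_sum(tf.square(g))
    return batch_loss, tf.sqrt(g2)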