# Siwei Causevic VXU1230

## s1.py
import os
import time
import numpy as np
from collections import defaultdict, OrderedDict
from multiprocessing import Pool
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
import string

## s2.py
# Fetch the "imdb_reviews" dataset through TensorFlow Datasets. With
# with_info=False the call returns only the datasets (no metadata object).
data = tfds.load("imdb_reviews", with_info=False)

# Pull the two splits out of the returned mapping by name.
train_data = data['train']
test_data = data['test']

# Wrap each split with tfds.as_numpy so the records can be consumed as
# NumPy values instead of TensorFlow tensors.
np_train_data, np_test_data = (
    tfds.as_numpy(train_data),
    tfds.as_numpy(test_data),
)

## s3.py
def clean_sentence(dic):
    """Convert one record into a list of lemmatized, purely alphabetic tokens.

    The bytes stored under ``dic["text"]`` are decoded as UTF-8, lower-cased
    and run through ``str.translate`` with ``EXCLUDE_TRANS``. The result is
    split on whitespace; tokens that are not entirely alphabetic are dropped.
    Each remaining token is tagged on its own with ``pos_tag``, the tag is
    mapped through ``POS_DIC`` and the token is lemmatized with ``LEMMA``.

    ``EXCLUDE_TRANS``, ``LEMMA``, ``POS_DIC`` and ``pos_tag`` are not defined
    in the visible code.
    NOTE(review): presumably an NLTK lemmatizer/tagger and a punctuation
    translation table -- confirm against their definitions.

    Args:
        dic: Mapping with a ``"text"`` entry holding a ``bytes`` value.

    Returns:
        list: The lemmatized tokens, in their original order.
    """
    text = dic["text"].decode("utf-8")
    text = text.lower().translate(EXCLUDE_TRANS)

    lemmas = []
    for token in text.split():
        if not token.isalpha():
            continue
        # Tag the single token in isolation and keep only the tag string.
        tag = pos_tag([token])[0][1]
        lemmas.append(LEMMA.lemmatize(token, POS_DIC[tag]))
    return lemmas

# Number of worker processes created by the pool in clean_batch.
NUM_WORKERS = 20
def clean_batch(batch_rows):
    """Apply ``clean_sentence`` to every row of ``batch_rows`` in a process pool.

    A ``multiprocessing.Pool`` with ``NUM_WORKERS`` processes maps
    ``clean_sentence`` over the rows, handing them to the workers in chunks
    of 200. The pool is closed when the ``with`` block exits.

    Args:
        batch_rows: Iterable of rows; each row is passed as the single
            argument of ``clean_sentence``.

    NOTE(review): the visible body stores the mapped result in ``batch`` but
    contains no ``return``; the definition may be cut off here -- confirm.
    """
    with Pool(processes=NUM_WORKERS) as pool:
        batch = pool.map(clean_sentence, batch_rows, chunksize=200)

## s4.py
# NOTE(review): presumably lower/upper percentile cut-offs applied while
# building the vocabulary -- their use is not visible here; confirm.
PERCENT_L = 10
PERCENT_U = 90
# Set to the process's current working directory at the time this line runs.
# NOTE(review): presumably where logs/checkpoints are written -- confirm.
LOG_DIR = os.getcwd()
# While False, build_vocab takes its "not WARM_START" branch and starts from
# an empty defaultdict(int).
WARM_START = False

def build_vocab(data):
    if not WARM_START:
        dic = defaultdict(int)
        for sent in data:
            for word in sent:

## s5.py
# Passed to skipgrams() as window_size in process_sent.
WINDOW_SIZE = 5
# Passed to skipgrams() as negative_samples in process_sent.
NEGATIVE_SAMPLES = 10

def process_sent(sent):
    """Generate skip-gram pairs for one sentence and split them by role.

    A sampling table of length ``VOCAB_SIZE`` is built and handed to
    ``keras.preprocessing.sequence.skipgrams`` together with ``WINDOW_SIZE``
    and ``NEGATIVE_SAMPLES``; the pairs are shuffled (``shuffle=True``). The
    resulting ``couples`` are then unzipped into a tuple of target entries
    and a tuple of context entries.

    ``VOCAB_SIZE`` is not defined in the visible code.

    Args:
        sent: Sequence passed to ``skipgrams`` as ``sequence``.
            NOTE(review): presumably a list of integer word indices --
            confirm against the caller.

    Raises:
        ValueError: If ``skipgrams`` returns no couples, because unpacking
            ``zip(*couples)`` into two names then fails.

    NOTE(review): no ``return`` is visible after the unpacking; the
    definition may be cut off here -- confirm.
    """
    sampling = keras.preprocessing.sequence.make_sampling_table(VOCAB_SIZE)
    couples, labels = keras.preprocessing.sequence.skipgrams(
        sequence=sent, vocabulary_size=VOCAB_SIZE, window_size=WINDOW_SIZE,
        negative_samples=NEGATIVE_SAMPLES, shuffle=True, sampling_table=sampling
    )
    # Was indented deeper than the rest of the body, which made the whole
    # function fail to compile with an IndentationError.
    target, context = zip(*couples)

## s6.py
def next_batch(train_input, training=True):
    """Flatten the training columns and step through them in batch-sized slices.

    Columns 0, 1 and 2 of ``train_input`` are each concatenated with
    ``np.hstack`` and cast to ``float32`` to form the target, context and
    label arrays respectively.

    ``BATCH_SIZE`` is not defined in the visible code.

    Args:
        train_input: Indexed as ``train_input[:, k]`` for k = 0, 1, 2, so it
            must support two-dimensional NumPy-style indexing.
            NOTE(review): presumably an object array whose cells hold
            per-sentence sequences -- confirm against the caller.
        training: Not referenced in the visible part of the body.

    NOTE(review): the loop below shows no exit, ``yield`` or update of
    ``counter``/``epoch``, and ``word_size`` and ``label_data`` are not used
    yet; the definition looks cut off here -- confirm.
    """
    target_data = np.hstack(train_input[:, 0]).astype(np.float32)
    context_data = np.hstack(train_input[:, 1]).astype(np.float32)
    label_data = np.hstack(train_input[:, 2]).astype(np.float32)
    # Largest multiple of BATCH_SIZE that does not exceed the number of targets.
    word_size = target_data.size // BATCH_SIZE * BATCH_SIZE
    epoch = 1
    counter = 0
    while True:
        # Take up to BATCH_SIZE consecutive entries starting at the offset.
        t_batch = target_data[counter:counter + BATCH_SIZE]
        c_batch = context_data[counter:counter + BATCH_SIZE]

## s7.py
class MyModel(tf.keras.Model):
    """Keras model subclass that owns an embedding layer named 'embedding'.

    NOTE(review): only the beginning of ``__init__`` is visible; any further
    layers and the ``call`` method are not shown. Other code in this file
    invokes the model as ``model(target, context)`` -- confirm that ``call``
    accepts two positional inputs.
    """
    def __init__(self, vocab_size, embed_size):
        """Create the two input placeholders and the embedding layer.

        Args:
            vocab_size: First argument to ``layers.Embedding`` (its input
                dimension).
            embed_size: Second argument to ``layers.Embedding`` (its output
                dimension).
        """
        super(MyModel, self).__init__()
        # Two symbolic Keras inputs, each declared with shape (1,).
        self.target_inputs = layers.Input((1,))
        self.context_inputs = layers.Input((1,))
        # Embedding table initialised with the Glorot-normal initializer.
        self.embedding = layers.Embedding(
            vocab_size,
            embed_size,
            embeddings_initializer=tf.keras.initializers.glorot_normal(),
            name='embedding')

## s9.py
def evaluate(model, loss_fn, target_data, context_data, label_data):
    """Compute the loss of ``model`` on one batch without updating it.

    Args:
        model: Callable invoked as ``model(target_data, context_data)``.
        loss_fn: Callable invoked as ``loss_fn(label_data, predictions)``.
        target_data: First positional input forwarded to ``model``.
        context_data: Second positional input forwarded to ``model``.
        label_data: Ground-truth values forwarded to ``loss_fn``.

    Returns:
        Whatever ``loss_fn`` returns for the labels and the model output.
    """
    return loss_fn(label_data, model(target_data, context_data))

## s10.py
# NOTE(review): not referenced in the visible code; presumably the number of
# most-similar words reported per example by print_eval -- confirm.
NUM_SIM = 5

def get_similarity(sim_examples, embed_weights):
    """Return cosine similarities between selected rows and all embedding rows.

    Each row of ``embed_weights`` is divided by its Euclidean norm, the rows
    indexed by ``sim_examples`` are looked up, and those rows are multiplied
    with the transpose of the full normalized matrix. Every entry is
    therefore the dot product of two unit-length rows.

    A row consisting only of zeros has norm 0 and yields NaN after the
    division.

    Args:
        sim_examples: Row indices passed to ``tf.nn.embedding_lookup``.
        embed_weights: Embedding matrix.
            NOTE(review): assumed to be 2-D (rows = words, columns =
            embedding dimensions) -- confirm.

    Returns:
        Tensor holding one row of similarities per entry of ``sim_examples``.
    """
    squared_row_sums = tf.reduce_sum(tf.square(embed_weights), 1, keepdims=True)
    unit_rows = embed_weights / tf.sqrt(squared_row_sums)
    selected_rows = tf.nn.embedding_lookup(unit_rows, sim_examples)
    return tf.matmul(selected_rows, tf.transpose(unit_rows))

def print_eval(valid_examples, sim_matrix, reverse_dic):

## s8.py
@tf.function
def train_step(model, loss_fn, optimizer, target, context, label):
    with tf.GradientTape() as tape:
        predictions = model(target, context)
        batch_loss = loss_fn(label, predictions)
    gradients = tape.gradient(batch_loss, model.trainable_variables)
    c_gradients = [tf.clip_by_value(g, -5., 5.) for g in gradients if g is not None]
    optimizer.apply_gradients(zip(c_gradients, model.trainable_variables))
    g2 = 0
    for g in c_gradients:
	import os
	import time
	import numpy as np
	from collections import defaultdict, OrderedDict
	from multiprocessing import Pool
	import tensorflow as tf
	from tensorflow import keras
	from tensorflow.keras import layers
	import tensorflow_datasets as tfds
	import string
	data = tfds.load("imdb_reviews", with_info=False)
	train_data, test_data = data['train'], data['test']
	np_train_data = tfds.as_numpy(train_data)
	np_test_data = tfds.as_numpy(test_data)
	def clean_sentence(dic):
	sent = dic["text"].decode("utf-8").lower().translate(EXCLUDE_TRANS)
	words = [LEMMA.lemmatize(word, POS_DIC[pos_tag([word])[0][1]]) for word in sent.split()
	if word.isalpha()]
	return words

	NUM_WORKERS = 20
	def clean_batch(batch_rows):
	with Pool(processes=NUM_WORKERS) as pool:
	batch = pool.map(clean_sentence, batch_rows, chunksize=200)
	PERCENT_L = 10
	PERCENT_U = 90
	LOG_DIR = os.getcwd()
	WARM_START = False

	def build_vocab(data):
	if not WARM_START:
	dic = defaultdict(int)
	for sent in data:
	for word in sent:
	WINDOW_SIZE = 5
	NEGATIVE_SAMPLES = 10

	def process_sent(sent):
	sampling = keras.preprocessing.sequence.make_sampling_table(VOCAB_SIZE)
	couples, labels = keras.preprocessing.sequence.skipgrams(
	sequence=sent, vocabulary_size=VOCAB_SIZE, window_size=WINDOW_SIZE,
	negative_samples=NEGATIVE_SAMPLES, shuffle=True, sampling_table=sampling
	)
	target, context = zip(*couples)
	def next_batch(train_input, training=True):
	target_data = np.hstack(train_input[:, 0]).astype(np.float32)
	context_data = np.hstack(train_input[:, 1]).astype(np.float32)
	label_data = np.hstack(train_input[:, 2]).astype(np.float32)
	word_size = target_data.size // BATCH_SIZE * BATCH_SIZE
	epoch = 1
	counter = 0
	while True:
	t_batch = target_data[counter:counter + BATCH_SIZE]
	c_batch = context_data[counter:counter + BATCH_SIZE]
	class MyModel(tf.keras.Model):
	def __init__(self, vocab_size, embed_size):
	super(MyModel, self).__init__()
	self.target_inputs = layers.Input((1,))
	self.context_inputs = layers.Input((1,))
	self.embedding = layers.Embedding(
	vocab_size,
	embed_size,
	embeddings_initializer=tf.keras.initializers.glorot_normal(),
	name='embedding')
	def evaluate(model, loss_fn, target_data, context_data, label_data):
	predictions = model(target_data, context_data)
	batch_loss = loss_fn(label_data, predictions)
	return batch_loss
	NUM_SIM = 5

	def get_similarity(sim_examples, embed_weights):
	norm = tf.sqrt(tf.reduce_sum(tf.square(embed_weights), 1, keepdims=True))
	norm_embed_matrix = embed_weights / norm
	valid_embed = tf.nn.embedding_lookup(norm_embed_matrix, sim_examples)
	sim_matrix = tf.matmul(valid_embed, tf.transpose(norm_embed_matrix))
	return sim_matrix

	def print_eval(valid_examples, sim_matrix, reverse_dic):
	@tf.function
	def train_step(model, loss_fn, optimizer, target, context, label):
	with tf.GradientTape() as tape:
	predictions = model(target, context)
	batch_loss = loss_fn(label, predictions)
	gradients = tape.gradient(batch_loss, model.trainable_variables)
	c_gradients = [tf.clip_by_value(g, -5., 5.) for g in gradients if g is not None]
	optimizer.apply_gradients(zip(c_gradients, model.trainable_variables))
	g2 = 0
	for g in c_gradients: