Textual entailment training using TensorFlow.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import urllib
import sys
import os
import zipfile
glove_vectors_file = "glove.6B.50d.txt"
snli_dev_file = "snli_1.0_dev.txt"
snli_full_dataset_file = "snli_1.0_train.txt"
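# The rest of this script assumes glove.6B.50d.txt and snli_1.0_dev.txt already
# sit in the working directory (links are in the notes at the bottom of this
# gist); snli_full_dataset_file names the full training split, but only the dev
# split is read below. Here is a minimal, illustrative download-and-unzip
# helper using the imports above; the zip URLs and extraction paths are
# assumptions, so check them against the Stanford project pages before
# uncommenting the calls.
def maybe_download_and_unzip(url, zip_path):
    """Fetch `url` into `zip_path` (if missing) and extract the archive here."""
    from urllib.request import urlretrieve
    if not os.path.isfile(zip_path):
        urlretrieve(url, zip_path)
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall()

# maybe_download_and_unzip("http://nlp.stanford.edu/data/glove.6B.zip",
#                          "glove.6B.zip")
# maybe_download_and_unzip("https://nlp.stanford.edu/projects/snli/snli_1.0.zip",
#                          "snli_1.0.zip")  # note: extracts into a subfolder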
glove_wordmap = {}
with open(glove_vectors_file, "r", encoding="utf-8") as glove:
    for line in glove:
        name, vector = tuple(line.split(" ", 1))
        glove_wordmap[name] = np.fromstring(vector, sep=" ")
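# Illustrative sanity check (not in the original gist): with the 50d GloVe
# file, every entry should be a 50-dimensional vector; "the" is assumed to be
# in the vocabulary.
assert glove_wordmap["the"].shape == (50,)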
def sentence2sequence(sentence):
    """Turn a sentence into a list of GloVe vectors and the matched sub-words."""
    tokens = sentence.lower().split(" ")
    rows = []
    words = []
    # Greedy search for tokens: if a token is not in the GloVe vocabulary,
    # back off to its longest prefix that is, then continue with the remainder.
    for token in tokens:
        i = len(token)
        while len(token) > 0 and i > 0:
            word = token[:i]
            if word in glove_wordmap:
                rows.append(glove_wordmap[word])
                words.append(word)
                token = token[i:]
                i = len(token)
            else:
                i = i - 1
    return rows, words
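# Illustrative usage (not in the original gist): each matched word or sub-word
# contributes one 50-d GloVe vector.
_example_rows, _example_words = sentence2sequence("a dog runs quickly")
assert len(_example_rows) == len(_example_words)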
rnn_size = 64
# A plain RNN cell shown for illustration; the model below uses LSTM cells,
# and this `rnn` variable is not referenced again.
rnn = tf.contrib.rnn.BasicRNNCell(rnn_size)
# Constants setup
max_hypothesis_length, max_evidence_length = 30, 30
batch_size, vector_size, hidden_size = 128, 50, 64
lstm_size = hidden_size
weight_decay = 0.0001
learning_rate = 1
input_p, output_p = 0.5, 0.5
training_iterations_count = 100000
display_step = 10
def score_setup(row):
    """Turn the five annotator labels of an SNLI row into a class distribution."""
    convert_dict = {
        'entailment': 0,
        'neutral': 1,
        'contradiction': 2
    }
    score = np.zeros((3,))
    for x in range(1, 6):
        tag = row["label" + str(x)]
        if tag in convert_dict:
            score[convert_dict[tag]] += 1
    return score / (1.0 * np.sum(score))
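# Toy example (illustrative, not SNLI data): three "entailment" votes, one
# "neutral" and one "contradiction" become the distribution [0.6, 0.2, 0.2].
_toy_row = {"label1": "entailment", "label2": "entailment",
            "label3": "entailment", "label4": "neutral",
            "label5": "contradiction"}
assert np.allclose(score_setup(_toy_row), [0.6, 0.2, 0.2])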
def fit_to_size(matrix, shape):
    """Zero-pad or truncate `matrix` so that it has exactly `shape`."""
    res = np.zeros(shape)
    slices = tuple(slice(0, min(dim, shape[e])) for e, dim in enumerate(matrix.shape))
    res[slices] = matrix[slices]
    return res
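# Illustrative checks (not in the original gist): short inputs are zero-padded
# and long inputs are truncated to the requested shape.
assert fit_to_size(np.ones((2, 3)), (4, 3)).shape == (4, 3)
assert fit_to_size(np.ones((40, 50)), (30, 50)).shape == (30, 50)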
def split_data_into_scores():
    """Read the SNLI dev split; return padded sentence matrices, gold labels
    and annotator score distributions."""
    import csv
    with open(snli_dev_file, "r") as data:
        train = csv.DictReader(data, delimiter='\t')
        evi_sentences = []
        hyp_sentences = []
        labels = []
        scores = []
        for row in train:
            hyp_sentences.append(np.vstack(
                sentence2sequence(row["sentence1"].lower())[0]))
            evi_sentences.append(np.vstack(
                sentence2sequence(row["sentence2"].lower())[0]))
            labels.append(row["gold_label"])
            scores.append(score_setup(row))

        hyp_sentences = np.stack([fit_to_size(x, (max_hypothesis_length, vector_size))
                                  for x in hyp_sentences])
        evi_sentences = np.stack([fit_to_size(x, (max_evidence_length, vector_size))
                                  for x in evi_sentences])

        return (hyp_sentences, evi_sentences), labels, np.array(scores)
data_feature_list, correct_values, correct_scores = split_data_into_scores()
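# Resulting shapes (illustrative, assuming the ~10k-pair SNLI dev split):
# data_feature_list[0] and data_feature_list[1] are (num_pairs, 30, 50) arrays,
# correct_values is a list of gold labels, and correct_scores is (num_pairs, 3).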
l_h, l_e = max_hypothesis_length, max_evidence_length
N, D, H = batch_size, vector_size, hidden_size
l_seq = l_h + l_e
tf.reset_default_graph()

lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
lstm_drop = tf.contrib.rnn.DropoutWrapper(lstm, input_p, output_p)
# N:   the number of examples in each training batch, used to train on
#      subsets of the data for efficiency's sake.
# l_h: the maximum length of a hypothesis sentence.
# l_e: the maximum length of an evidence sentence.
#      Both lengths are fixed because an RNN is far easier to train when it is
#      unrolled to a fixed number of steps.
# D:   the dimensionality of the GloVe (or other) word vectors.
hyp = tf.placeholder(tf.float32, [N, l_h, D], 'hypothesis')
evi = tf.placeholder(tf.float32, [N, l_e, D], 'evidence')
y = tf.placeholder(tf.float32, [N, 3], 'label')
# hyp: Where the hypotheses will be stored during training.
# evi: Where the evidences will be stored during training.
# y: Where correct scores will be stored during training.

# lstm_size: the size of the gates in the LSTM,
#   as in the first LSTM layer's initialization.
lstm_back = tf.contrib.rnn.BasicLSTMCell(lstm_size)
# lstm_back: The LSTM used for looking backwards
#   through the sentences, similar to lstm.

# input_p: the probability that inputs to the LSTM will be retained at each
#   iteration of dropout.
# output_p: the probability that outputs from the LSTM will be retained at
#   each iteration of dropout.
lstm_drop_back = tf.contrib.rnn.DropoutWrapper(lstm_back, input_p, output_p)
# lstm_drop_back: A dropout wrapper for lstm_back, like lstm_drop.
fc_initializer = tf.random_normal_initializer(stddev=0.1)
# fc_initializer: initial values for the fully connected layer's weights.

# hidden_size: the size of the outputs from each lstm layer.
#   Multiplied by 2 to account for the two LSTMs.
fc_weight = tf.get_variable('fc_weight', [2 * hidden_size, 3],
                            initializer=fc_initializer)
# fc_weight: Storage for the fully connected layer's weights.
fc_bias = tf.get_variable('bias', [3])
# fc_bias: Storage for the fully connected layer's bias.

# tf.GraphKeys.REGULARIZATION_LOSSES: A key to a collection in the graph
#   designated for losses due to regularization.
#   In this case, this portion of loss is regularization on the weights
#   for the fully connected layer.
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
                     tf.nn.l2_loss(fc_weight))
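# (Everything added to this collection is summed into the regularization term
#  of total_loss further down, scaled by weight_decay.)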
x = tf.concat([hyp, evi], 1)  # N, (Lh+Le), d
# Permuting batch_size and n_steps
x = tf.transpose(x, [1, 0, 2])  # (Le+Lh), N, d
# Reshaping to (n_steps*batch_size, n_input)
x = tf.reshape(x, [-1, vector_size])  # (Le+Lh)*N, d
# Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
x = tf.split(x, l_seq)
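# x is now a Python list of l_seq tensors, each of shape (N, D), which is the
# input format tf.contrib.rnn.static_bidirectional_rnn expects.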
# x: the inputs to the bidirectional_rnn

# tf.contrib.rnn.static_bidirectional_rnn: Runs the input through
#   two recurrent networks, one that runs the inputs forward and one
#   that runs the inputs in reversed order, combining the outputs.
#   (The dropout wrappers defined above are not used in this gist.)
rnn_outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(lstm, lstm_back,
                                                            x, dtype=tf.float32)
# rnn_outputs: the list of LSTM outputs, as a list.
#   What we want is the latest output, rnn_outputs[-1]

classification_scores = tf.matmul(rnn_outputs[-1], fc_weight) + fc_bias
# The scores are relative certainties for how likely the output matches
# a certain entailment:
#     0: Positive entailment
#     1: Neutral entailment
#     2: Negative entailment
with tf.variable_scope('Accuracy'):
    predicts = tf.cast(tf.argmax(classification_scores, 1), 'int32')
    y_label = tf.cast(tf.argmax(y, 1), 'int32')
    corrects = tf.equal(predicts, y_label)
    num_corrects = tf.reduce_sum(tf.cast(corrects, tf.float32))
    accuracy = tf.reduce_mean(tf.cast(corrects, tf.float32))
with tf.variable_scope("loss"): | |
cross_entropy = tf.nn.softmax_cross_entropy_with_logits( | |
logits = classification_scores, labels = y) | |
loss = tf.reduce_mean(cross_entropy) | |
total_loss = loss + weight_decay * tf.add_n( | |
tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) | |
optimizer = tf.train.GradientDescentOptimizer(learning_rate) | |
opt_op = optimizer.minimize(total_loss) | |
# Initialize variables
init = tf.global_variables_initializer()

# Use TQDM if installed
tqdm_installed = False
try:
    from tqdm import tqdm
    tqdm_installed = True
except ImportError:
    pass

# Launch the TensorFlow session
sess = tf.Session()
writer = tf.summary.FileWriter('./log', sess.graph)  # write the graph to disk
merge_op = tf.summary.merge_all()  # unused here: no tf.summary ops are defined
sess.run(init)
# training_iterations_count: The number of data pieces to train on in total
# batch_size: The number of data pieces per batch
training_iterations = range(0, training_iterations_count, batch_size)
if tqdm_installed:
    # Add a progress bar if TQDM is installed
    training_iterations = tqdm(training_iterations)

for i in training_iterations:
    # Select indices for a random data subset
    batch = np.random.randint(data_feature_list[0].shape[0], size=batch_size)

    # Use the selected subset indices to initialize the graph's
    # placeholder values
    hyps, evis, ys = (data_feature_list[0][batch, :],
                      data_feature_list[1][batch, :],
                      correct_scores[batch])

    # Run the optimization with these initialized values
    sess.run([opt_op], feed_dict={hyp: hyps, evi: evis, y: ys})

    # display_step: how often the accuracy and loss should
    # be tested and displayed.
    if (i // batch_size) % display_step == 0:
        # Calculate batch accuracy
        acc = sess.run(accuracy, feed_dict={hyp: hyps, evi: evis, y: ys})
        # Calculate batch loss
        tmp_loss = sess.run(loss, feed_dict={hyp: hyps, evi: evis, y: ys})
        # Display results
        print("Iter " + str(i // batch_size) + ", Minibatch Loss= " +
              "{:.6f}".format(tmp_loss) + ", Training Accuracy= " +
              "{:.5f}".format(acc))

        summary = tf.Summary(value=[tf.Summary.Value(tag="Accuracy",
                                                     simple_value=acc)])
        writer.add_summary(summary, i)
evidences = ["Janos and Jade both were at the scene of the car crash."]
hypotheses = ["Multiple people saw the accident."]

sentence1 = [fit_to_size(np.vstack(sentence2sequence(evidence)[0]),
                         (30, 50)) for evidence in evidences]
sentence2 = [fit_to_size(np.vstack(sentence2sequence(hypothesis)[0]),
                         (30, 50)) for hypothesis in hypotheses]

# The placeholders expect a full batch of N examples, so the single example is
# repeated N times; y is fed dummy values because only the forward pass is run.
prediction = sess.run(classification_scores, feed_dict={hyp: (sentence1 * N),
                                                        evi: (sentence2 * N),
                                                        y: [[0, 0, 0]] * N})
print(["Positive", "Neutral", "Negative"][np.argmax(prediction[0])] +
      " entailment")

sess.close()
You need the SNLI data from https://nlp.stanford.edu/projects/snli as well as the GloVe embedding data from https://nlp.stanford.edu/projects/glove/
Inspired by https://www.oreilly.com/learning/textual-entailment-with-tensorflow