@katsugeneration
Created May 15, 2016 07:28
Experimental code for running Skip-Thought Vectors with word2vec vectors as the input and output.
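The script reads a plain-text corpus of space-separated words (sentences delimited by 。) and a word2vec dictionary loaded through the accompanying utils module, which is not included in the gist. As a hedged usage sketch, with skip_thought.py, corpus.txt and vectors.bin standing in for file names the gist does not specify: training would look like "python skip_thought.py --source corpus.txt --dict vectors.bin", and generation from a saved model like "python skip_thought.py --source corpus.txt --dict vectors.bin --initmodel skip-thought.model --decode".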
# coding:utf-8
import chainer
from chainer import cuda
import chainer.links as L
import chainer.functions as F
from chainer import optimizers
from chainer import serializers
from chainer.functions.activation import sigmoid
from chainer.functions.activation import tanh
from chainer import link
from chainer.links.connection import linear
import argparse
import math
import sys
import time
import numpy as np
import six
import struct
import utils
# GRU cell whose reset/update gates and candidate state are additionally
# conditioned on an external vector `cond` (the encoder state), as used by
# the Skip-Thought decoders.
class ConditionalStatefulGRU(link.Chain):
    def __init__(self, n_inputs, n_units, n_cond):
        super(ConditionalStatefulGRU, self).__init__(
            W_r=linear.Linear(n_inputs, n_units),
            U_r=linear.Linear(n_units, n_units),
            W_z=linear.Linear(n_inputs, n_units),
            U_z=linear.Linear(n_units, n_units),
            W=linear.Linear(n_inputs, n_units),
            U=linear.Linear(n_units, n_units),
            C_r=linear.Linear(n_cond, n_units),
            C_z=linear.Linear(n_cond, n_units),
            C=linear.Linear(n_cond, n_units),
        )
        self.reset_state()

    def to_cpu(self):
        super(ConditionalStatefulGRU, self).to_cpu()
        if self.h is not None:
            self.h.to_cpu()

    def to_gpu(self, device=None):
        super(ConditionalStatefulGRU, self).to_gpu(device)
        if self.h is not None:
            self.h.to_gpu(device)

    def set_state(self, h):
        assert isinstance(h, chainer.Variable)
        h_ = h
        if self.xp == np:
            h_.to_cpu()
        else:
            h_.to_gpu()
        self.h = h_

    def reset_state(self):
        self.h = None

    def __call__(self, x, cond):
        z = self.W_z(x)
        h_bar = self.W(x)
        if self.h is not None:
            r = sigmoid.sigmoid(self.W_r(x) + self.U_r(self.h) + self.C_r(cond))
            z += self.U_z(self.h)
            h_bar += self.U(r * self.h)
        z = sigmoid.sigmoid(z + self.C_z(cond))
        h_bar = tanh.tanh(h_bar + self.C(cond))
        h_new = z * h_bar
        if self.h is not None:
            h_new += (1 - z) * self.h
        self.h = h_new
        return self.h
# Skip-Thought model that works directly on word2vec vectors: the encoder GRU
# reads the current sentence, decoder_b regenerates the previous sentence and
# decoder_a the following sentence, both conditioned on the encoder state.
class SkipThought(chainer.Chain):
    def __init__(self, n_vocab, n_dec_units, n_cond, train=True):
        super(SkipThought, self).__init__(
            encoder=L.StatefulGRU(n_vocab, n_cond),
            decoder_b=ConditionalStatefulGRU(n_vocab, n_dec_units, n_cond),
            decoder_a=ConditionalStatefulGRU(n_vocab, n_dec_units, n_cond),
            embed_dec=L.Linear(n_dec_units, n_vocab),
        )
        self.train = train
        self.n_vocab = n_vocab
        self.n_cond = n_cond
        self.n_dec_units = n_dec_units
        self.volatile = 'off' if train else 'on'

    def __call__(self, x_words, y_words, z_words):
        # Initialize both decoder states with small random values.
        h = np.ndarray((1, self.n_dec_units), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_dec_units)
        self.decoder_b.set_state(chainer.Variable(h, volatile=self.volatile))
        h = np.ndarray((1, self.n_dec_units), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_dec_units)
        self.decoder_a.set_state(chainer.Variable(h, volatile=self.volatile))
        cond = self.encode(x_words)
        cost_a = cost_b = 0
        # Decode the previous sentence (y_words) word by word.
        word = get_word_vectors('<eos>')
        for next in y_words.data:
            y_c = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
            y_n = chainer.Variable(self.xp.array([next], dtype=np.float32), volatile=self.volatile)
            emb_b = self.decoder_b(F.dropout(y_c), cond)
            next_b = self.embed_dec(F.dropout(emb_b))
            cost_b += F.mean_squared_error(next_b, y_n)
            word = next
            # print(np.argmax(next_b.data), next, cost_b.data)
        # Decode the following sentence (z_words) word by word.
        word = get_word_vectors('<eos>')
        for next in z_words.data:
            z_c = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
            z_n = chainer.Variable(self.xp.array([next], dtype=np.float32), volatile=self.volatile)
            emb_a = self.decoder_a(F.dropout(z_c), cond)
            next_a = self.embed_dec(F.dropout(emb_a))  # fixed: previously reused emb_b here
            cost_a += F.mean_squared_error(next_a, z_n)
            word = next
        self.loss = cost_a + cost_b
        return self.loss

    def encode(self, x_words):
        # Run the encoder GRU over the current sentence's word vectors;
        # the final state is used to condition both decoders.
        h = np.ndarray((1, self.n_cond), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_cond)
        self.encoder.set_state(chainer.Variable(h, volatile=self.volatile))
        for word in x_words.data:
            x = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
            cond = self.encoder(F.dropout(x, train=self.train))
        return cond

    def decode(self, x_words, stop_words, vocab, voc_inv, word_size, vector_size, cluster):
        h = np.ndarray((1, self.n_dec_units), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_dec_units)
        self.decoder_b.set_state(chainer.Variable(h, volatile=self.volatile))
        h = np.ndarray((1, self.n_dec_units), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_dec_units)
        self.decoder_a.set_state(chainer.Variable(h, volatile=self.volatile))
        cond = self.encode(x_words)
        back_sentence = []
        ahead_sentence = []
        word = get_word_vectors('<eos>')
        next = 0
        count = 0
        while voc_inv[next] != stop_words and count <= 50:
            # print(self.decoder_b.h.data)
            y_c = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
            emb_b = self.decoder_b(y_c, cond)
            next_b = self.embed_dec(emb_b)
            # print(next_b.data / utils.norm(next_b.data[0]))
            # Restrict the candidates to the k-means cluster nearest to the predicted
            # vector, then sample a word from a softmax over negative mean squared errors.
            cluster_b = cluster.predict(next_b.data / utils.norm(next_b.data[0]))
            now = time.time()
            cluster_voc = [key for (i, key) in enumerate(vocab) if cluster.labels_[i] == cluster_b]
            error_b = chainer.Variable(self.xp.array([[-((next_b.data[0] / utils.norm(next_b.data[0]) - vocab[value]) ** 2).mean() for value in cluster_voc]], dtype=np.float32))
            print(time.time() - now)
            prob_b = F.softmax(error_b).data[0]
            next = np.random.multinomial(1, prob_b).argmax()
            print(cluster_b, next, prob_b[next])
            word = get_word_vectors(cluster_voc[next])
            back_sentence.append(cluster_voc[next])
            count += 1
        # word = get_word_vectors('<eos>')
        # next = 0
        # count = 0
        # while voc_inv[next] != stop_words and count <= 50:
        #     z_c = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
        #     emb_a = self.decoder_a(z_c, cond)
        #     next_a = self.embed_dec(emb_a)
        #     error_a = chainer.Variable(self.xp.array([[-((next_a.data[0] - [vocab[voc_inv[count]]]) ** 2).mean() for count in six.moves.range(word_size)]], dtype=np.float32))
        #     prob_a = F.softmax(error_a).data[0] - 0.0001
        #     next = np.random.multinomial(1, prob_a).argmax()
        #     word = get_word_vectors(next)
        #     ahead_sentence.append(next)
        #     count += 1
        return back_sentence, ahead_sentence
parser = argparse.ArgumentParser()
parser.add_argument('--source', '-s', default='',
                    help='source text separated words')
parser.add_argument('--dict', '-d', default='',
                    help='word vector dictionary')
parser.add_argument('--initmodel', '-m', default='',
                    help='Initialize the model from given file')
parser.add_argument('--gpu', '-g', default=-1, type=int,
                    help='GPU ID (negative value indicates CPU)')
parser.add_argument('--epoch', '-e', default=20, type=int,
                    help='number of epochs to learn')
parser.add_argument('--unit', '-u', default=2400, type=int,
                    help='number of units')
parser.add_argument('--batchsize', '-b', type=int, default=20,
                    help='learning minibatch size')
parser.add_argument('--gradclip', '-c', type=int, default=5,
                    help='gradient norm threshold to clip')
parser.add_argument('--test', dest='test', action='store_true')
parser.add_argument('--decode', dest='decode', action='store_true')
parser.set_defaults(test=False)
parser.set_defaults(decode=False)
args = parser.parse_args()

xp = cuda.cupy if args.gpu >= 0 else np

n_epoch = args.epoch        # number of epochs
n_units = args.unit         # number of units per layer
batchsize = args.batchsize  # minibatch size
grad_clip = args.gradclip   # gradient norm threshold to clip


def get_word_vectors(word):
    global vocab
    if word in vocab:
        return vocab[word]
    else:
        print(word)
        ret = np.zeros((vector_size,), dtype=np.float32)
        ret[0] = 1.
        return ret


vocab, voc_inv, word_size, vector_size = utils.load_word2vec(args.dict)
km = utils.make_cluster(list(vocab.values()), 50)
print(km.counts_)
# Prepare the Skip-Thought model defined above
model = SkipThought(vector_size, n_units, n_units, train=not args.decode)
model.compute_accuracy = False  # we only want the perplexity
for param in model.params():
    data = param.data
    data[:] = np.random.uniform(-0.1, 0.1, data.shape)
if args.gpu >= 0:
    cuda.get_device(args.gpu).use()
    model.to_gpu()

# Setup optimizer
optimizer = optimizers.Adam(alpha=0.0002, beta1=0.1, beta2=0.001, eps=1e-8)
optimizer.setup(model)

# Init/Resume
if args.initmodel:
    print('Load model from', args.initmodel)
    serializers.load_npz(args.initmodel, model)
if args.decode:
    # Generation mode: encode each sentence and sample the preceding sentence.
    i = 0
    train_data = open(args.source)
    for paragraph in train_data:
        paragraph = paragraph.replace('\n', '').strip()
        for line in paragraph.split('。'):
            words = line.strip().split()
            if len(words) == 0:
                continue
            i += 1
            words.append('。')
            words.append('<eos>')
            print(words)
            x = chainer.Variable(xp.asarray([get_word_vectors(word) for word in words]), volatile='on')
            back, ahead = model.decode(x, '.', vocab, voc_inv, word_size, vector_size, km)
            print(back)
            print()
            if args.test:
                if i >= 100:
                    break
else:
    # Learning loop
    cur_log_perp = xp.zeros(())
    epoch = 0
    start_at = time.time()
    cur_at = start_at
    accum_loss = 0
    print('going to train {} epochs'.format(n_epoch))
    i = 0
    for epoch in six.moves.range(n_epoch):
        back_words = None
        current_words = None
        ahead_words = None
        train_data = open(args.source)
        for paragraph in train_data:
            paragraph = paragraph.replace('\n', '').strip()
            for line in paragraph.split('。'):
                words = line.strip().split()
                if len(words) == 0:
                    continue
                i += 1
                # Slide the window of (previous, current, next) sentences.
                back_words = current_words
                current_words = ahead_words
                ahead_words = words
                ahead_words.append('。')
                ahead_words.append('<eos>')
                if back_words is None or current_words is None:
                    continue
                x = chainer.Variable(xp.asarray([get_word_vectors(word) for word in current_words]))
                back = chainer.Variable(xp.asarray([get_word_vectors(word) for word in back_words]))
                ahead = chainer.Variable(xp.asarray([get_word_vectors(word) for word in ahead_words]))
                loss_i = model(x, back, ahead)
                accum_loss += loss_i
                cur_log_perp += model.loss.data
                if (i + 1) % batchsize == 0:  # Run truncated BPTT
                    model.zerograds()
                    accum_loss.backward()
                    accum_loss.unchain_backward()
                    optimizer.update()
                    accum_loss = 0
                if (i + 1) % 100 == 0:
                    now = time.time()
                    throughput = 100. / (now - cur_at)
                    perp = float(cur_log_perp) / 100
                    print('iter {} training perplexity: {:.2f} ({:.2f} iters/sec)'.format(
                        i + 1, perp, throughput))
                    cur_at = now
                    cur_log_perp.fill(0)
                if (i + 1) % 10000 == 0:
                    # Save the model and the optimizer
                    model.to_cpu()
                    print('save the model')
                    serializers.save_npz('skip-thought.model', model)
                    print('save the optimizer')
                    serializers.save_npz('skip-thought.state', optimizer)
                sys.stdout.flush()
                if args.test:
                    if i >= 100 * (epoch + 1):
                        break

    # Save the model and the optimizer
    model.to_cpu()
    print('save the model')
    serializers.save_npz('skip-thought.model', model)
    print('save the optimizer')
    serializers.save_npz('skip-thought.state', optimizer)
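The utils module imported at the top is not included in this gist. Below is a minimal, hypothetical sketch of the three helpers the script calls (load_word2vec, make_cluster, norm), assuming gensim for reading a binary word2vec file and scikit-learn's MiniBatchKMeans for clustering (the script also reads km.counts_, which only older scikit-learn releases expose); the author's real implementation may differ.

# utils.py -- hypothetical stand-in for the helpers this gist expects.
import numpy as np
from gensim.models import KeyedVectors
from sklearn.cluster import MiniBatchKMeans


def norm(v):
    # Plain L2 norm of a vector.
    return np.linalg.norm(v)


def load_word2vec(path):
    # Returns (vocab, voc_inv, word_size, vector_size), where vocab maps
    # word -> float32 vector and voc_inv maps index -> word.
    w2v = KeyedVectors.load_word2vec_format(path, binary=True)
    words = list(w2v.index_to_key)  # gensim >= 4; older versions use index2word
    vocab = {w: np.asarray(w2v[w], dtype=np.float32) for w in words}
    voc_inv = {i: w for i, w in enumerate(words)}
    return vocab, voc_inv, len(words), w2v.vector_size


def make_cluster(vectors, n_clusters):
    # Cluster the word vectors; the script uses labels_, predict() and counts_.
    km = MiniBatchKMeans(n_clusters=n_clusters)
    km.fit(np.asarray(vectors, dtype=np.float32))
    return km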