@katsugeneration
Created May 15, 2016 07:28
Experimental code for running Skip-Thought Vectors with word2vec vectors as the input and output.
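The script reads a plain-text corpus of space-separated words (sentences delimited by 。) and a word2vec dictionary loaded through the accompanying utils module, which is not included in the gist. As a hedged usage sketch, with skip_thought.py, corpus.txt and vectors.bin standing in for file names the gist does not specify: training would look like "python skip_thought.py --source corpus.txt --dict vectors.bin", and generation from a saved model like "python skip_thought.py --source corpus.txt --dict vectors.bin --initmodel skip-thought.model --decode".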
# coding:utf-8
import chainer
from chainer import cuda
import chainer.links as L
import chainer.functions as F
from chainer import optimizers
from chainer import serializers
from chainer.functions.activation import sigmoid
from chainer.functions.activation import tanh
from chainer import link
from chainer.links.connection import linear
import argparse
import math
import sys
import time
import numpy as np
import six
import struct
import utils
# GRU cell whose reset/update gates and candidate state are additionally
# conditioned on an external vector `cond` (the encoder state), as used by
# the Skip-Thought decoders.
class ConditionalStatefulGRU(link.Chain):
    def __init__(self, n_inputs, n_units, n_cond):
        super(ConditionalStatefulGRU, self).__init__(
            W_r=linear.Linear(n_inputs, n_units),
            U_r=linear.Linear(n_units, n_units),
            W_z=linear.Linear(n_inputs, n_units),
            U_z=linear.Linear(n_units, n_units),
            W=linear.Linear(n_inputs, n_units),
            U=linear.Linear(n_units, n_units),
            C_r=linear.Linear(n_cond, n_units),
            C_z=linear.Linear(n_cond, n_units),
            C=linear.Linear(n_cond, n_units),
        )
        self.reset_state()

    def to_cpu(self):
        super(ConditionalStatefulGRU, self).to_cpu()
        if self.h is not None:
            self.h.to_cpu()

    def to_gpu(self, device=None):
        super(ConditionalStatefulGRU, self).to_gpu(device)
        if self.h is not None:
            self.h.to_gpu(device)

    def set_state(self, h):
        assert isinstance(h, chainer.Variable)
        h_ = h
        if self.xp == np:
            h_.to_cpu()
        else:
            h_.to_gpu()
        self.h = h_

    def reset_state(self):
        self.h = None

    def __call__(self, x, cond):
        z = self.W_z(x)
        h_bar = self.W(x)
        if self.h is not None:
            r = sigmoid.sigmoid(self.W_r(x) + self.U_r(self.h) + self.C_r(cond))
            z += self.U_z(self.h)
            h_bar += self.U(r * self.h)
        z = sigmoid.sigmoid(z + self.C_z(cond))
        h_bar = tanh.tanh(h_bar + self.C(cond))
        h_new = z * h_bar
        if self.h is not None:
            h_new += (1 - z) * self.h
        self.h = h_new
        return self.h
# Skip-Thought model that works directly on word2vec vectors: the encoder GRU
# reads the current sentence, decoder_b regenerates the previous sentence and
# decoder_a the following sentence, both conditioned on the encoder state.
class SkipThought(chainer.Chain):
    def __init__(self, n_vocab, n_dec_units, n_cond, train=True):
        super(SkipThought, self).__init__(
            encoder=L.StatefulGRU(n_vocab, n_cond),
            decoder_b=ConditionalStatefulGRU(n_vocab, n_dec_units, n_cond),
            decoder_a=ConditionalStatefulGRU(n_vocab, n_dec_units, n_cond),
            embed_dec=L.Linear(n_dec_units, n_vocab),
        )
        self.train = train
        self.n_vocab = n_vocab
        self.n_cond = n_cond
        self.n_dec_units = n_dec_units
        self.volatile = 'off' if train else 'on'

    def __call__(self, x_words, y_words, z_words):
        # Initialize both decoder states with small random values.
        h = np.ndarray((1, self.n_dec_units), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_dec_units)
        self.decoder_b.set_state(chainer.Variable(h, volatile=self.volatile))
        h = np.ndarray((1, self.n_dec_units), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_dec_units)
        self.decoder_a.set_state(chainer.Variable(h, volatile=self.volatile))
        cond = self.encode(x_words)
        cost_a = cost_b = 0
        # Decode the previous sentence (y_words) word by word.
        word = get_word_vectors('<eos>')
        for next in y_words.data:
            y_c = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
            y_n = chainer.Variable(self.xp.array([next], dtype=np.float32), volatile=self.volatile)
            emb_b = self.decoder_b(F.dropout(y_c), cond)
            next_b = self.embed_dec(F.dropout(emb_b))
            cost_b += F.mean_squared_error(next_b, y_n)
            word = next
            # print(np.argmax(next_b.data), next, cost_b.data)
        # Decode the following sentence (z_words) word by word.
        word = get_word_vectors('<eos>')
        for next in z_words.data:
            z_c = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
            z_n = chainer.Variable(self.xp.array([next], dtype=np.float32), volatile=self.volatile)
            emb_a = self.decoder_a(F.dropout(z_c), cond)
            next_a = self.embed_dec(F.dropout(emb_a))  # fixed: previously reused emb_b here
            cost_a += F.mean_squared_error(next_a, z_n)
            word = next
        self.loss = cost_a + cost_b
        return self.loss

    def encode(self, x_words):
        # Run the encoder GRU over the current sentence's word vectors;
        # the final state is used to condition both decoders.
        h = np.ndarray((1, self.n_cond), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_cond)
        self.encoder.set_state(chainer.Variable(h, volatile=self.volatile))
        for word in x_words.data:
            x = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
            cond = self.encoder(F.dropout(x, train=self.train))
        return cond

    def decode(self, x_words, stop_words, vocab, voc_inv, word_size, vector_size, cluster):
        h = np.ndarray((1, self.n_dec_units), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_dec_units)
        self.decoder_b.set_state(chainer.Variable(h, volatile=self.volatile))
        h = np.ndarray((1, self.n_dec_units), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_dec_units)
        self.decoder_a.set_state(chainer.Variable(h, volatile=self.volatile))
        cond = self.encode(x_words)
        back_sentence = []
        ahead_sentence = []
        word = get_word_vectors('<eos>')
        next = 0
        count = 0
        while voc_inv[next] != stop_words and count <= 50:
            # print(self.decoder_b.h.data)
            y_c = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
            emb_b = self.decoder_b(y_c, cond)
            next_b = self.embed_dec(emb_b)
            # print(next_b.data / utils.norm(next_b.data[0]))
            # Restrict the candidates to the k-means cluster nearest to the predicted
            # vector, then sample a word from a softmax over negative mean squared errors.
            cluster_b = cluster.predict(next_b.data / utils.norm(next_b.data[0]))
            now = time.time()
            cluster_voc = [key for (i, key) in enumerate(vocab) if cluster.labels_[i] == cluster_b]
            error_b = chainer.Variable(self.xp.array([[-((next_b.data[0] / utils.norm(next_b.data[0]) - vocab[value]) ** 2).mean() for value in cluster_voc]], dtype=np.float32))
            print(time.time() - now)
            prob_b = F.softmax(error_b).data[0]
            next = np.random.multinomial(1, prob_b).argmax()
            print(cluster_b, next, prob_b[next])
            word = get_word_vectors(cluster_voc[next])
            back_sentence.append(cluster_voc[next])
            count += 1
        # word = get_word_vectors('<eos>')
        # next = 0
        # count = 0
        # while voc_inv[next] != stop_words and count <= 50:
        #     z_c = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
        #     emb_a = self.decoder_a(z_c, cond)
        #     next_a = self.embed_dec(emb_a)
        #     error_a = chainer.Variable(self.xp.array([[-((next_a.data[0] - [vocab[voc_inv[count]]]) ** 2).mean() for count in six.moves.range(word_size)]], dtype=np.float32))
        #     prob_a = F.softmax(error_a).data[0] - 0.0001
        #     next = np.random.multinomial(1, prob_a).argmax()
        #     word = get_word_vectors(next)
        #     ahead_sentence.append(next)
        #     count += 1
        return back_sentence, ahead_sentence
parser = argparse.ArgumentParser()
parser.add_argument('--source', '-s', default='',
                    help='source text separated words')
parser.add_argument('--dict', '-d', default='',
                    help='word vector dictionary')
parser.add_argument('--initmodel', '-m', default='',
                    help='Initialize the model from given file')
parser.add_argument('--gpu', '-g', default=-1, type=int,
                    help='GPU ID (negative value indicates CPU)')
parser.add_argument('--epoch', '-e', default=20, type=int,
                    help='number of epochs to learn')
parser.add_argument('--unit', '-u', default=2400, type=int,
                    help='number of units')
parser.add_argument('--batchsize', '-b', type=int, default=20,
                    help='learning minibatch size')
parser.add_argument('--gradclip', '-c', type=int, default=5,
                    help='gradient norm threshold to clip')
parser.add_argument('--test', dest='test', action='store_true')
parser.add_argument('--decode', dest='decode', action='store_true')
parser.set_defaults(test=False)
parser.set_defaults(decode=False)
args = parser.parse_args()

xp = cuda.cupy if args.gpu >= 0 else np

n_epoch = args.epoch        # number of epochs
n_units = args.unit         # number of units per layer
batchsize = args.batchsize  # minibatch size
grad_clip = args.gradclip   # gradient norm threshold to clip


def get_word_vectors(word):
    global vocab
    if word in vocab:
        return vocab[word]
    else:
        print(word)
        ret = np.zeros((vector_size,), dtype=np.float32)
        ret[0] = 1.
        return ret


vocab, voc_inv, word_size, vector_size = utils.load_word2vec(args.dict)
km = utils.make_cluster(list(vocab.values()), 50)
print(km.counts_)
# Prepare the Skip-Thought model defined above
model = SkipThought(vector_size, n_units, n_units, train=not args.decode)
model.compute_accuracy = False  # we only want the perplexity
for param in model.params():
    data = param.data
    data[:] = np.random.uniform(-0.1, 0.1, data.shape)
if args.gpu >= 0:
    cuda.get_device(args.gpu).use()
    model.to_gpu()

# Setup optimizer
optimizer = optimizers.Adam(alpha=0.0002, beta1=0.1, beta2=0.001, eps=1e-8)
optimizer.setup(model)

# Init/Resume
if args.initmodel:
    print('Load model from', args.initmodel)
    serializers.load_npz(args.initmodel, model)
if args.decode:
    # Generation mode: encode each sentence and sample the preceding sentence.
    i = 0
    train_data = open(args.source)
    for paragraph in train_data:
        paragraph = paragraph.replace('\n', '').strip()
        for line in paragraph.split('。'):
            words = line.strip().split()
            if len(words) == 0:
                continue
            i += 1
            words.append('。')
            words.append('<eos>')
            print(words)
            x = chainer.Variable(xp.asarray([get_word_vectors(word) for word in words]), volatile='on')
            back, ahead = model.decode(x, '.', vocab, voc_inv, word_size, vector_size, km)
            print(back)
            print()
            if args.test:
                if i >= 100:
                    break
else:
    # Learning loop
    cur_log_perp = xp.zeros(())
    epoch = 0
    start_at = time.time()
    cur_at = start_at
    accum_loss = 0
    print('going to train {} epochs'.format(n_epoch))
    i = 0
    for epoch in six.moves.range(n_epoch):
        back_words = None
        current_words = None
        ahead_words = None
        train_data = open(args.source)
        for paragraph in train_data:
            paragraph = paragraph.replace('\n', '').strip()
            for line in paragraph.split('。'):
                words = line.strip().split()
                if len(words) == 0:
                    continue
                i += 1
                # Slide the window of (previous, current, next) sentences.
                back_words = current_words
                current_words = ahead_words
                ahead_words = words
                ahead_words.append('。')
                ahead_words.append('<eos>')
                if back_words is None or current_words is None:
                    continue
                x = chainer.Variable(xp.asarray([get_word_vectors(word) for word in current_words]))
                back = chainer.Variable(xp.asarray([get_word_vectors(word) for word in back_words]))
                ahead = chainer.Variable(xp.asarray([get_word_vectors(word) for word in ahead_words]))
                loss_i = model(x, back, ahead)
                accum_loss += loss_i
                cur_log_perp += model.loss.data
                if (i + 1) % batchsize == 0:  # Run truncated BPTT
                    model.zerograds()
                    accum_loss.backward()
                    accum_loss.unchain_backward()
                    optimizer.update()
                    accum_loss = 0
                if (i + 1) % 100 == 0:
                    now = time.time()
                    throughput = 100. / (now - cur_at)
                    perp = float(cur_log_perp) / 100
                    print('iter {} training perplexity: {:.2f} ({:.2f} iters/sec)'.format(
                        i + 1, perp, throughput))
                    cur_at = now
                    cur_log_perp.fill(0)
                if (i + 1) % 10000 == 0:
                    # Save the model and the optimizer
                    model.to_cpu()
                    print('save the model')
                    serializers.save_npz('skip-thought.model', model)
                    print('save the optimizer')
                    serializers.save_npz('skip-thought.state', optimizer)
                sys.stdout.flush()
                if args.test:
                    if i >= 100 * (epoch + 1):
                        break

    # Save the model and the optimizer
    model.to_cpu()
    print('save the model')
    serializers.save_npz('skip-thought.model', model)
    print('save the optimizer')
    serializers.save_npz('skip-thought.state', optimizer)
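The utils module imported at the top is not included in this gist. Below is a minimal, hypothetical sketch of the three helpers the script calls (load_word2vec, make_cluster, norm), assuming gensim for reading a binary word2vec file and scikit-learn's MiniBatchKMeans for clustering (the script also reads km.counts_, which only older scikit-learn releases expose); the author's real implementation may differ.

# utils.py -- hypothetical stand-in for the helpers this gist expects.
import numpy as np
from gensim.models import KeyedVectors
from sklearn.cluster import MiniBatchKMeans


def norm(v):
    # Plain L2 norm of a vector.
    return np.linalg.norm(v)


def load_word2vec(path):
    # Returns (vocab, voc_inv, word_size, vector_size), where vocab maps
    # word -> float32 vector and voc_inv maps index -> word.
    w2v = KeyedVectors.load_word2vec_format(path, binary=True)
    words = list(w2v.index_to_key)  # gensim >= 4; older versions use index2word
    vocab = {w: np.asarray(w2v[w], dtype=np.float32) for w in words}
    voc_inv = {i: w for i, w in enumerate(words)}
    return vocab, voc_inv, len(words), w2v.vector_size


def make_cluster(vectors, n_clusters):
    # Cluster the word vectors; the script uses labels_, predict() and counts_.
    km = MiniBatchKMeans(n_clusters=n_clusters)
    km.fit(np.asarray(vectors, dtype=np.float32))
    return km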