keras_model_rnn_error.py
theano/scan_module/scan_perform.pyx in theano.scan_module.scan_perform.perform (/home/ubuntu/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.12-64/scan_perform/mod.cpp:4193)()
ValueError: dimension mismatch in args to gemm (64,256)x(256,256)->(1,256)
Apply node that caused the error: GpuGemm{no_inplace}(GpuSubtensor{::, int64::}.0, TensorConstant{0.20000000298}, <CudaNdarrayType(float32, matrix)>, lstm_7_U_o_copy[cuda], TensorConstant{0.20000000298})
Toposort index: 5
Inputs types: [CudaNdarrayType(float32, matrix), TensorType(float32, scalar), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix), TensorType(float32, scalar)]
Inputs shapes: [(1, 256), (), (64, 256), (256, 256), ()]
Inputs strides: [(0, 1), (), (256, 1), (256, 1), ()]
Inputs values: ['not shown', array(0.20000000298023224, dtype=float32), 'not shown', 'not shown', array(0.20000000298023224, dtype=float32)]
Outputs clients: [[GpuElemwise{Composite{(clip((i0 + i1), i2, i3) * tanh(i4))},no_inplace}(CudaNdarrayConstant{[[ 0.5]]}, GpuGemm{no_inplace}.0, CudaNdarrayConstant{[[ 0.]]}, CudaNdarrayConstant{[[ 1.]]}, GpuElemwise{Composite{((clip((i0 + i1), i2, i3) * i4) + (clip((i0 + i5), i2, i3) * tanh(i6)))},no_inplace}.0)]]
HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
Apply node that caused the error: forall_inplace,gpu,scan_fn}(TensorConstant{10}, GpuDimShuffle{1,0,2}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, TensorConstant{10}, lstm_7_U_o, lstm_7_U_f, lstm_7_U_i, lstm_7_U_c)
Toposort index: 75
Inputs types: [TensorType(int64, scalar), CudaNdarrayType(float32, 3D), CudaNdarrayType(float32, 3D), CudaNdarrayType(float32, 3D), TensorType(int64, scalar), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix)]
Inputs shapes: [(), (10, 1, 1024), (2, 64, 256), (2, 64, 256), (), (256, 256), (256, 256), (256, 256), (256, 256)]
Inputs strides: [(), (1024, 0, 1), (16384, 256, 1), (16384, 256, 1), (), (256, 1), (256, 1), (256, 1), (256, 1)]
Inputs values: [array(10), 'not shown', 'not shown', 'not shown', array(10), 'not shown', 'not shown', 'not shown', 'not shown']
Outputs clients: [[GpuSubtensor{int64}(forall_inplace,gpu,scan_fn}.0, Constant{1})], [GpuSubtensor{int64}(forall_inplace,gpu,scan_fn}.1, Constant{1})], [GpuDimShuffle{0,1,2}(forall_inplace,gpu,scan_fn}.2)]]
HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
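
Reading the traceback: the failing gemm multiplies a (64, 256) activation batch by the (256, 256) recurrent matrix lstm_7_U_o but is asked to produce a (1, 256) result, and the outer scan node shows an input sequence with a batch dimension of 1 (shape (10, 1, 1024)) against saved LSTM states with a batch dimension of 64 (shape (2, 64, 256)). This pattern is consistent with calling predict() on a single sample against a stateful model whose batch_input_shape fixes the batch size at 64; a hedged workaround sketch follows the code below.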
from __future__ import division, print_function  # must precede all other statements
import re
import pickle
from theano.sandbox import cuda
%matplotlib inline
import utils; reload(utils)
from utils import *
from keras.layers import *
from keras.layers.normalization import *
from keras.datasets import imdb
idx = imdb.get_word_index()
path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')
f = open(path, 'rb')
(x_train, labels_train), (x_test, labels_test) = pickle.load(f)
idx2word = {v: k for k, v in idx.iteritems()}
articles = [' '.join([idx2word[word] for word in review]) for review in x_train]
idx = {}
article_words = [name.decode('unicode_escape').encode('ascii', 'ignore').split(" ") for name in articles]
words = [item for sublist in article_words for item in sublist]
# Count word frequencies
for word in words:
    if word not in idx:
        idx[word] = 1
    else:
        idx[word] += 1
idx_arr = sorted(idx, key=idx.get, reverse=True)  # vocabulary sorted by frequency, descending
vocab_size = 5000
idx2word = {i: v for i, v in enumerate(idx_arr)}
word2idx = {v: k for k, v in idx2word.iteritems()}
def load_vectors(loc):
    return (load_array(loc+'.dat'),
            pickle.load(open(loc+'_words.pkl', 'rb')),
            pickle.load(open(loc+'_idx.pkl', 'rb')))
vecs, words, wordidx = load_vectors('data/6B.50d')
def create_emb():
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))
    for i in range(1, len(emb)):
        word = idx2word[i]
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word) and word in wordidx:
            src_idx = wordidx[word]
            emb[i] = vecs[src_idx]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = normal(scale=0.6, size=(n_fact,))
    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = normal(scale=0.6, size=(n_fact,))
    emb /= 3
    return emb
emb = create_emb()
n_fact = vecs.shape[1]
word_length = min(10, int(np.floor(np.mean([len(wds) for wds in article_words]))))
batch_size = 64
n_hidden = 256
c_in_dat = [[np.clip(word2idx[article_words[articleidx][widx]], 0, vocab_size-1)
             for widx in range(0, min(len(article_words[articleidx])-1, word_length-1))]
            for articleidx in range(len(article_words))]
c_out_dat = [[np.clip(word2idx[article_words[articleidx][widx]], 0, vocab_size-1)
              for widx in range(1, min(len(article_words[articleidx]), word_length))]
             for articleidx in range(len(article_words))]
x_rnn = sequence.pad_sequences(c_in_dat, maxlen=word_length, value=0)
ys = sequence.pad_sequences(c_out_dat, maxlen=word_length, value=0)
x_rnn = x_rnn.clip(0, vocab_size-1)
y_rnn = np.expand_dims(np.array(ys), -1)  # define y_rnn from ys before clipping it
y_rnn = y_rnn.clip(0, vocab_size-1)
model = Sequential([
    Embedding(vocab_size, n_fact, input_length=word_length,
              weights=[emb],
              trainable=False,
              batch_input_shape=(batch_size, word_length)),
    BatchNormalization(),
    LSTM(n_hidden, return_sequences=True, stateful=True),
    TimeDistributed(Dense(vocab_size, activation='softmax')),
])
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=1e-15))
mx = len(x_rnn)//batch_size*batch_size  # truncate to a whole number of batches; a stateful model needs full batches
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=batch_size, nb_epoch=4, shuffle=False)
def get_nexts_keras(inp):
    idxs = [word2idx[c] for c in inp]
    arr = np.array(idxs)[np.newaxis, :]
    p = model.predict(arr)[0]
    print(list(inp))
    return [idx2word[np.argmax(o)] for o in p]
get_nexts_keras(['', 'that', '', '', '', '', '', '', '', ''])
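# Note (not in the original gist): the attached traceback - a scan over 10
# timesteps with a batch of 1 against saved states of shape (2, 64, 256) -
# appears to match this call: predict() receives a single row while the
# stateful LSTM was compiled with batch_input_shape=(64, word_length).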
n_hidden = 256
batch_size = 64
word_length = int(np.floor(np.mean([len(wds) for wds in article_words])))  # becomes 8
n_fact = vecs.shape[1]  # 50
model = Sequential([
    Embedding(vocab_size, n_fact, input_length=word_length,
              weights=[emb],
              trainable=False,
              batch_input_shape=(batch_size, word_length)),
    BatchNormalization(),
    LSTM(n_hidden, return_sequences=True, stateful=True),
    TimeDistributed(Dense(vocab_size, activation='softmax')),
])
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=1e-15))
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=batch_size, nb_epoch=1, shuffle=False)  # works just fine
def get_nexts_keras(inp):
    idxs = [word2idx[c] for c in inp]
    arr = np.array(idxs)[np.newaxis, :]
    p = model.predict(arr, verbose=True)[0]
    print(list(inp))
    return [idx_arr[np.argmax(o)] for o in p]
get_nexts_keras(["Netflix", '', '', '', '', '', '', ''])
# Throws error (attached)
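
# A hedged workaround sketch, not part of the original gist: a stateful model
# compiled with batch_input_shape=(batch_size, word_length) expects every
# batch - including the one passed to predict() - to hold exactly batch_size
# rows, which matches the (1, 256) vs (64, 256) gemm mismatch above. Tiling
# the single query sequence up to a full batch and reading back row 0 keeps
# the batch dimension consistent. get_nexts_keras_batched is a hypothetical
# helper name.
def get_nexts_keras_batched(inp):
    idxs = [word2idx[c] for c in inp]
    arr = np.tile(np.array(idxs)[np.newaxis, :], (batch_size, 1))  # shape (64, word_length)
    p = model.predict(arr, batch_size=batch_size)[0]  # predictions for the first (real) row
    print(list(inp))
    return [idx_arr[np.argmax(o)] for o in p]
# An alternative is to build an identical model with batch_input_shape=(1, word_length)
# and copy the trained weights across with inference_model.set_weights(model.get_weights()).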