@JonathanRaiman
Last active March 22, 2019 11:49
Convert CUDNN LSTM to Dynamic RNN
"""
Little script demonstrating how to run cudnn rnns
without cudnn using dynamic rnn with the same weights
(e.g. train on cudnn, use with dynamic rnn on cpu).
Note: this will run slower than cudnn on a gpu (see below).
Tested on Titan X Pascal:
With cudnn 3.5s vs. with dynamic_rnn 8s to run through 79 batches
with batch size 128.
Network: input size: 127, 2 layer bidirectional LSTM with num_units 200.
"""
import tensorflow as tf
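
# NOTE: `maybe_dropout` is referenced by `lstm_activation` below but was not
# defined in the original gist; this is a minimal sketch of what it likely
# looks like (apply dropout only at training time).
def maybe_dropout(x, keep_prob, is_training):
    if is_training and keep_prob < 1.0:
        return tf.nn.dropout(x, keep_prob=keep_prob)
    return x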

def lstm_activation(inputs, input_h, input_c, W, b, forget_bias, activation,
                    keep_prob, is_training, cudnn_order=False):
    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    cell_inputs = tf.concat([inputs, input_h], axis=1)
    matmulled_inputs = tf.matmul(cell_inputs, W)
    lstm_matrix = tf.nn.bias_add(matmulled_inputs, b)
    preactiv = tf.split(lstm_matrix, 4, axis=1)
    if cudnn_order:
        # from the CUDNN docs:
        # Values 0 and 4 reference the input gate.
        # Values 1 and 5 reference the forget gate.
        # Values 2 and 6 reference the new memory gate.
        # Values 3 and 7 reference the output gate.
        i, f, j, o = (
            preactiv[CUDNN_MAPPING["i"]],
            preactiv[CUDNN_MAPPING["f"]],
            preactiv[CUDNN_MAPPING["j"]],
            preactiv[CUDNN_MAPPING["o"]]
        )
    else:
        i, j, f, o = preactiv
    c = (tf.nn.sigmoid(f + forget_bias) * input_c +
         tf.nn.sigmoid(i) * activation(j))
    activated_c = activation(c)
    m = tf.nn.sigmoid(o) * maybe_dropout(
        activated_c, keep_prob, is_training
    )
    return (c, m)

class ParametrizedLSTMCell(tf.nn.rnn_cell.RNNCell):
    def __init__(self, weights, biases, hidden_size):
        super(ParametrizedLSTMCell, self).__init__()
        self.weights = weights
        self.biases = biases
        self.hidden_size = hidden_size

    @property
    def state_size(self):
        return (self.hidden_size, self.hidden_size)

    @property
    def output_size(self):
        return self.hidden_size

    def __call__(self, inputs, state, scope=None):
        input_h, input_c = state
        c, m = lstm_activation(inputs,
                               input_h=input_h,
                               input_c=input_c,
                               keep_prob=1.0,
                               is_training=False,
                               forget_bias=0.0,
                               b=self.biases,
                               W=self.weights,
                               cudnn_order=True,
                               activation=tf.nn.tanh)
        return m, (m, c)

def cudnn_lstm_parameter_size(input_size, hidden_size):
    """Number of parameters in a single CuDNN LSTM cell."""
    biases = 8 * hidden_size
    weights = 4 * (hidden_size * input_size) + 4 * (hidden_size * hidden_size)
    return biases + weights
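
# Sanity check (illustrative only), for the network described in the module
# docstring (input size 127, num_units 200):
#   biases  = 8 * 200                         =   1,600
#   weights = 4 * 200 * 127 + 4 * 200 * 200   = 261,600
#   total                                     = 263,200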

def direction_to_num_directions(direction):
    if direction == "unidirectional":
        return 1
    elif direction == "bidirectional":
        return 2
    else:
        raise ValueError("Unknown direction: %r." % (direction,))

def estimate_cudnn_parameter_size(num_layers,
                                  input_size,
                                  hidden_size,
                                  input_mode,
                                  direction):
    """
    Compute the number of parameters needed to
    construct a stack of LSTMs. Assumes the hidden states
    of bidirectional LSTMs are concatenated before being
    sent to the next layer up.
    """
    num_directions = direction_to_num_directions(direction)
    params = 0
    isize = input_size
    for layer in range(num_layers):
        for direction in range(num_directions):
            params += cudnn_lstm_parameter_size(
                isize, hidden_size
            )
        isize = hidden_size * num_directions
    return params
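
# Illustration (not part of the original gist): for the 2-layer bidirectional
# network in the module docstring (input size 127, num_units 200):
#   layer 0: 2 * cudnn_lstm_parameter_size(127, 200) = 2 * 263,200 =   526,400
#   layer 1: 2 * cudnn_lstm_parameter_size(400, 200) = 2 * 481,600 =   963,200
#   total                                                          = 1,489,600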

# cudnn conversion to dynamic RNN:
CUDNN_LAYER_WEIGHT_ORDER = [
    "x", "x", "x", "x", "h", "h", "h", "h"
]
CUDNN_LAYER_BIAS_ORDER = [
    "bx", "bx", "bx", "bx", "bh", "bh", "bh", "bh"
]
CUDNN_TRANSPOSED = True
CUDNN_MAPPING = {"i": 0, "f": 1, "j": 2, "o": 3}

def consume_biases_direction(params, old_offset, hidden_size, isize):
    offset = old_offset
    layer_biases_x = []
    layer_biases_h = []
    for piece in CUDNN_LAYER_BIAS_ORDER:
        if piece == "bx":
            layer_biases_x.append(
                params[offset:offset + hidden_size]
            )
            offset += hidden_size
        elif piece == "bh":
            layer_biases_h.append(
                params[offset:offset + hidden_size]
            )
            offset += hidden_size
        else:
            raise ValueError("Unknown cudnn piece %r." % (piece,))
    b = tf.concat(layer_biases_x, axis=0) + tf.concat(layer_biases_h, axis=0)
    return b, offset

def consume_weights_direction(params, old_offset, hidden_size, isize):
    offset = old_offset
    layer_weights_x = []
    layer_weights_h = []
    for piece in CUDNN_LAYER_WEIGHT_ORDER:
        if piece == "x":
            layer_weights_x.append(
                tf.reshape(
                    params[offset:offset + hidden_size * isize],
                    [hidden_size, isize] if CUDNN_TRANSPOSED else [isize, hidden_size]
                )
            )
            offset += hidden_size * isize
        elif piece == "h":
            layer_weights_h.append(
                tf.reshape(
                    params[offset:offset + hidden_size * hidden_size],
                    [hidden_size, hidden_size]
                )
            )
            offset += hidden_size * hidden_size
        else:
            raise ValueError("Unknown cudnn piece %r." % (piece,))
    if CUDNN_TRANSPOSED:
        W_T = tf.concat([tf.concat(layer_weights_x, axis=0),
                         tf.concat(layer_weights_h, axis=0)], axis=1)
        W = tf.transpose(W_T)
    else:
        W = tf.concat([tf.concat(layer_weights_x, axis=1),
                       tf.concat(layer_weights_h, axis=1)], axis=0)
    return W, offset

def decompose_layer_params(params, num_layers,
                           hidden_size, cell_input_size,
                           input_mode, direction):
    """Produce a list of pairs of the form (W, b) for use
    in a dynamic RNN from an opaque parameter tensor given by cudnn."""
    if input_mode != "linear_input":
        raise ValueError("Only input_mode == linear_input supported for now.")
    num_directions = direction_to_num_directions(direction)
    offset = 0
    all_weights = [[[] for j in range(num_directions)] for i in range(num_layers)]
    isize = cell_input_size
    for layer in range(num_layers):
        for direction in range(num_directions):
            W, offset = consume_weights_direction(
                params,
                old_offset=offset,
                hidden_size=hidden_size,
                isize=isize
            )
            all_weights[layer][direction].append(W)
        isize = hidden_size * num_directions
    isize = cell_input_size
    for layer in range(num_layers):
        for direction in range(num_directions):
            b, offset = consume_biases_direction(
                params,
                old_offset=offset,
                hidden_size=hidden_size,
                isize=isize
            )
            all_weights[layer][direction].append(b)
        isize = hidden_size * num_directions
    return all_weights

class FakeCudnnLSTM(object):
    def __init__(self, num_layers, hidden_size,
                 cell_input_size, input_mode, direction):
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.cell_input_size = cell_input_size
        self.input_mode = input_mode
        self.direction = direction

    def __call__(self,
                 inputs,
                 input_h,
                 input_c,
                 params,
                 is_training=True):
        layer_params = decompose_layer_params(
            params,
            num_layers=self.num_layers,
            hidden_size=self.hidden_size,
            cell_input_size=self.cell_input_size,
            input_mode=self.input_mode,
            direction=self.direction
        )
        layer_inputs = inputs
        cell_idx = 0
        for layer_param in layer_params:
            hidden_fwd_bwd = []
            final_output_c = []
            final_output_h = []
            for idx, (W, b) in enumerate(layer_param):
                if idx == 1:
                    # backward direction: run over the time-reversed sequence
                    layer_inputs = tf.reverse(layer_inputs, axis=[0])
                hiddens, (output_h, output_c) = tf.nn.dynamic_rnn(
                    cell=ParametrizedLSTMCell(W, b, self.hidden_size),
                    inputs=layer_inputs,
                    dtype=inputs.dtype,
                    time_major=True,
                    initial_state=(input_h[cell_idx], input_c[cell_idx])
                )
                if idx == 1:
                    hiddens = tf.reverse(hiddens, axis=[0])
                hidden_fwd_bwd.append(hiddens)
                final_output_h.append(output_h)
                final_output_c.append(output_c)
                cell_idx += 1
            if len(hidden_fwd_bwd) > 1:
                layer_inputs = tf.concat(hidden_fwd_bwd, axis=2)
                final_output_c = tf.concat(final_output_c, axis=1)
                final_output_h = tf.concat(final_output_h, axis=1)
            else:
                layer_inputs = hidden_fwd_bwd[0]
                final_output_c = final_output_c[0]
                final_output_h = final_output_h[0]
        return layer_inputs, final_output_h, final_output_c

def build_model(inputs, num_layers, hidden_size,
                direction="bidirectional",
                faux_cudnn=True):
    """
    Run an RNN on inputs using either cudnn or dynamic RNN.
    Arguments:
        inputs: tensor [time, batch, features], dtype tf.float32
        num_layers: int
        hidden_size: int
        direction: str, bidirectional or unidirectional
        faux_cudnn: bool; set faux_cudnn to True to use a dynamic RNN
            with the same weights as a CuDNN LSTM.
    """
    dtype = inputs.dtype
    input_size = inputs.get_shape()[-1].value
    est_size = estimate_cudnn_parameter_size(
        num_layers=num_layers,
        hidden_size=hidden_size,
        input_size=input_size,
        input_mode="linear_input",
        direction=direction
    )
    if faux_cudnn:
        # dynamic rnn route:
        cudnn_cell = FakeCudnnLSTM(
            num_layers,
            hidden_size,
            input_size,
            input_mode="linear_input",
            direction=direction
        )
    else:
        # cudnn route:
        cudnn_cell = tf.contrib.cudnn_rnn.CudnnLSTM(
            num_layers,
            hidden_size,
            input_size,
            input_mode="linear_input",
            direction=direction
        )
    # build parameters the opaque way:
    cudnn_params = tf.get_variable(
        "RNNParams",
        shape=[est_size],
        dtype=dtype,
        initializer=tf.contrib.layers.variance_scaling_initializer()
    )
    # start out with a zero state, tiled to the batch size of `inputs`
    init_state = tf.tile(
        tf.zeros(
            [2 * num_layers, 1, hidden_size],
            dtype=dtype
        ),
        [1, tf.shape(inputs)[1], 1]
    )
    hiddens, output_h, output_c = cudnn_cell(
        inputs,
        input_h=init_state,
        input_c=init_state,
        params=cudnn_params,
        is_training=True
    )
    return hiddens
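
# Example usage (an illustrative sketch, not part of the original gist): the
# placeholder shape is an assumption matching the network described in the
# module docstring (time-major inputs with 127 features, 2 layers, 200 units).
if __name__ == "__main__":
    sequence = tf.placeholder(tf.float32, [None, None, 127], name="sequence")
    with tf.variable_scope("RNN"):
        hiddens = build_model(
            sequence,
            num_layers=2,
            hidden_size=200,
            direction="bidirectional",
            faux_cudnn=True
        )
    # for the bidirectional case, `hiddens` has shape
    # [time, batch, 2 * hidden_size]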
@freesouls

freesouls commented Sep 30, 2017

Hi, there are quite a few errors when trying to run this script. For example, dtype and embed are not defined in build_model, and the argument order for tf.concat and tf.split is wrong according to the API docs (TF 1.1~1.3). I am wondering how you were able to run this code. Can you upload the actual code you use, or fix it and give a clearer example?

thanks in advance!
