Last active
March 22, 2019 11:49
-
-
Save JonathanRaiman/5a07eadc0e180ca944dee5f6485695e1 to your computer and use it in GitHub Desktop.
Convert CUDNN LSTM to Dynamic RNN
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Little script demonstrating how to run cudnn rnns
without cudnn using dynamic rnn with the same weights
(e.g. train on cudnn, use with dynamic rnn on cpu).
Note: this will run slower than cudnn on a gpu (see below).
Tested on Titan X Pascal:
With cudnn 3.5s vs. with dynamic_rnn 8s to run through 79 batches
with batch size 128.
Network: input size: 127, 2 layer bidirectional LSTM with num_units 200.
"""
import tensorflow as tf | |
def _maybe_dropout(tensor, keep_prob, is_training):
    """Apply dropout to `tensor` only when `is_training` is truthy.

    NOTE(review): assumes `is_training` is a plain Python bool (the only
    caller in this file passes `False`); a tensor-valued flag would need
    `tf.cond` instead.
    """
    if is_training:
        return tf.nn.dropout(tensor, keep_prob)
    return tensor


def lstm_activation(inputs, input_h, input_c, W, b, forget_bias, activation,
                    keep_prob, is_training, cudnn_order=False):
    """Run a single LSTM step and return the new cell and hidden states.

    `inputs` and `input_h` are concatenated along axis 1, multiplied by `W`,
    offset by `b`, and the result is split into four equal gate
    pre-activations along axis 1.

    Arguments:
        inputs: input tensor for this step.
        input_h: previous hidden state.
        input_c: previous cell state.
        W: weight matrix applied to the concatenated [inputs, input_h].
        b: bias added to the matmul result.
        forget_bias: constant added to the forget gate pre-activation.
        activation: callable applied to the new-input gate and to the
            new cell state (e.g. `tf.nn.tanh`).
        keep_prob: dropout keep probability for the activated cell state.
        is_training: dropout is only applied when this is truthy.
        cudnn_order: when True the four gate slices are looked up through
            CUDNN_MAPPING; otherwise they are taken in the order i, j, f, o.

    Returns:
        A pair `(c, m)`: the new cell state and the new hidden state.

    TensorFlow calls use the keyword form of the >= 1.0 signatures
    (`tf.concat(values, axis)`, `tf.split(value, num_or_size_splits, axis)`).
    """
    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    cell_inputs = tf.concat(values=[inputs, input_h], axis=1)
    lstm_matrix = tf.nn.bias_add(tf.matmul(cell_inputs, W), b)
    preactiv = tf.split(value=lstm_matrix, num_or_size_splits=4, axis=1)
    if cudnn_order:
        # from CUDNN docs:
        # Values 0 and 4 reference the input gate.
        # Values 1 and 5 reference the forget gate.
        # Values 2 and 6 reference the new memory gate.
        # Values 3 and 7 reference the output gate
        i = preactiv[CUDNN_MAPPING["i"]]
        f = preactiv[CUDNN_MAPPING["f"]]
        j = preactiv[CUDNN_MAPPING["j"]]
        o = preactiv[CUDNN_MAPPING["o"]]
    else:
        i, j, f, o = preactiv
    c = (tf.nn.sigmoid(f + forget_bias) * input_c +
         tf.nn.sigmoid(i) * activation(j))
    activated_c = activation(c)
    # Dropout goes through a local helper because no `maybe_dropout`
    # function is defined anywhere in this script.
    m = tf.nn.sigmoid(o) * _maybe_dropout(activated_c, keep_prob, is_training)
    return (c, m)
class ParametrizedLSTMCell(tf.nn.rnn_cell.RNNCell):
    """LSTM cell whose weight matrix and bias are supplied by the caller.

    The cell creates no variables of its own: each step is computed by
    `lstm_activation` with `cudnn_order=True`, a zero forget bias, tanh
    activation and dropout disabled.

    The state is the pair `(h, c)`, both of width `hidden_size`.
    """

    def __init__(self, weights, biases, hidden_size):
        """Store the externally-provided parameters.

        Arguments:
            weights: weight matrix passed to `lstm_activation` as `W`.
            biases: bias vector passed to `lstm_activation` as `b`.
            hidden_size: int, width of the hidden and cell states.
        """
        super(ParametrizedLSTMCell, self).__init__()
        # Kept under private names: on TensorFlow releases where RNNCell
        # derives from Layer, `weights` is a read-only property of the base
        # class and assigning to it raises AttributeError.
        self._W = weights
        self._b = biases
        self.hidden_size = hidden_size

    @property
    def state_size(self):
        """Sizes of the (h, c) state pair."""
        return (self.hidden_size, self.hidden_size)

    @property
    def output_size(self):
        """Width of the per-step output (the hidden state)."""
        return self.hidden_size

    def __call__(self, inputs, state, scope=None):
        """Advance the cell one step.

        Arguments:
            inputs: input tensor for this step.
            state: pair `(input_h, input_c)`.
            scope: unused; accepted for RNNCell call compatibility.

        Returns:
            `(m, (m, c))`: the new hidden state as output, followed by the
            new `(h, c)` state pair.
        """
        input_h, input_c = state
        c, m = lstm_activation(inputs,
                               input_h=input_h,
                               input_c=input_c,
                               keep_prob=1.0,
                               is_training=False,
                               forget_bias=0.0,
                               b=self._b,
                               W=self._W,
                               cudnn_order=True,
                               activation=tf.nn.tanh)
        return m, (m, c)
def cudnn_lstm_parameter_size(input_size, hidden_size):
    """Number of parameters in a single CuDNN LSTM cell.

    Counts eight bias vectors of length `hidden_size`, four
    `hidden_size x input_size` matrices and four
    `hidden_size x hidden_size` matrices.
    """
    bias_count = 8 * hidden_size
    input_weight_count = 4 * (hidden_size * input_size)
    recurrent_weight_count = 4 * (hidden_size * hidden_size)
    return bias_count + (input_weight_count + recurrent_weight_count)
def direction_to_num_directions(direction):
    """Translate a direction name into the number of directions.

    Returns 1 for "unidirectional" and 2 for "bidirectional".

    Raises:
        ValueError: if `direction` is neither of the two known names.
    """
    known_directions = (("unidirectional", 1), ("bidirectional", 2))
    for name, count in known_directions:
        if direction == name:
            return count
    raise ValueError("Unknown direction: %r." % (direction,))
def estimate_cudnn_parameter_size(num_layers,
                                  input_size,
                                  hidden_size,
                                  input_mode,
                                  direction):
    """
    Compute the number of parameters needed to
    construct a stack of LSTMs. Assumes the hidden states
    of bidirectional LSTMs are concatenated before being
    sent to the next layer up.

    `input_mode` is accepted but not used by the computation.
    """
    num_directions = direction_to_num_directions(direction)
    total = 0
    layer_input_size = input_size
    for _ in range(num_layers):
        # Every direction of a layer has the same parameter count.
        total += num_directions * cudnn_lstm_parameter_size(
            layer_input_size, hidden_size
        )
        # Layers above the first consume the concatenated directions.
        layer_input_size = hidden_size * num_directions
    return total
# cudnn conversion to dynamic RNN:

# Order in which consume_weights_direction reads weight pieces for one
# direction of one layer: four input ("x") matrices, then four
# recurrent ("h") matrices.
CUDNN_LAYER_WEIGHT_ORDER = ["x"] * 4 + ["h"] * 4

# Order in which consume_biases_direction reads bias pieces: four input
# ("bx") biases, then four recurrent ("bh") biases.
CUDNN_LAYER_BIAS_ORDER = ["bx"] * 4 + ["bh"] * 4

# When True, consume_weights_direction reshapes each input matrix as
# [hidden_size, isize] and transposes the assembled matrix.
CUDNN_TRANSPOSED = True

# Index of each gate among the four pre-activation slices
# (i = input, f = forget, j = new input, o = output).
CUDNN_MAPPING = dict(i=0, f=1, j=2, o=3)
def consume_biases_direction(params, old_offset, hidden_size, isize):
    """Read the biases of one layer direction out of the flat `params`.

    Walks CUDNN_LAYER_BIAS_ORDER, slicing `hidden_size` values per piece
    starting at `old_offset`. The "bx" slices and the "bh" slices are each
    concatenated along axis 0 and the two results are summed into a single
    bias vector.

    Arguments:
        params: flat parameter tensor to slice from.
        old_offset: int, position in `params` where reading starts.
        hidden_size: int, length of each bias piece.
        isize: unused; kept so the signature mirrors
            consume_weights_direction.

    Returns:
        `(b, offset)`: the combined bias and the offset just past the
        last value consumed.

    Raises:
        ValueError: if CUDNN_LAYER_BIAS_ORDER holds an unknown piece name.

    `tf.concat` is called with the TensorFlow >= 1.0 keyword signature.
    """
    offset = old_offset
    pieces = {"bx": [], "bh": []}
    for piece in CUDNN_LAYER_BIAS_ORDER:
        if piece not in pieces:
            raise ValueError("Unknown cudnn piece %r." % (piece,))
        pieces[piece].append(params[offset:offset + hidden_size])
        offset += hidden_size
    b = (tf.concat(values=pieces["bx"], axis=0) +
         tf.concat(values=pieces["bh"], axis=0))
    return b, offset
def consume_weights_direction(params, old_offset, hidden_size, isize):
    """Read the weights of one layer direction out of the flat `params`.

    Walks CUDNN_LAYER_WEIGHT_ORDER starting at `old_offset`. Each "x"
    piece takes `hidden_size * isize` values and each "h" piece takes
    `hidden_size * hidden_size` values. The pieces are assembled into one
    matrix: with CUDNN_TRANSPOSED the "x" and "h" pieces are stacked along
    axis 0, joined along axis 1 and the result is transposed; otherwise
    they are joined along axis 1 and then stacked along axis 0.

    Arguments:
        params: flat parameter tensor to slice from.
        old_offset: int, position in `params` where reading starts.
        hidden_size: int, number of hidden units.
        isize: int, input width of this layer.

    Returns:
        `(W, offset)`: the assembled weight matrix and the offset just
        past the last value consumed.

    Raises:
        ValueError: if CUDNN_LAYER_WEIGHT_ORDER holds an unknown piece name.

    `tf.concat` is called with the TensorFlow >= 1.0 keyword signature.
    """
    offset = old_offset
    layer_weights_x = []
    layer_weights_h = []
    x_shape = [hidden_size, isize] if CUDNN_TRANSPOSED else [isize, hidden_size]
    for piece in CUDNN_LAYER_WEIGHT_ORDER:
        if piece == "x":
            count = hidden_size * isize
            layer_weights_x.append(
                tf.reshape(params[offset:offset + count], x_shape)
            )
        elif piece == "h":
            count = hidden_size * hidden_size
            layer_weights_h.append(
                tf.reshape(params[offset:offset + count],
                           [hidden_size, hidden_size])
            )
        else:
            raise ValueError("Unknown cudnn piece %r." % (piece,))
        offset += count
    if CUDNN_TRANSPOSED:
        W_T = tf.concat(
            values=[tf.concat(values=layer_weights_x, axis=0),
                    tf.concat(values=layer_weights_h, axis=0)],
            axis=1
        )
        W = tf.transpose(W_T)
    else:
        W = tf.concat(
            values=[tf.concat(values=layer_weights_x, axis=1),
                    tf.concat(values=layer_weights_h, axis=1)],
            axis=0
        )
    return W, offset
def decompose_layer_params(params, num_layers,
                           hidden_size, cell_input_size,
                           input_mode, direction):
    """Produce a list of pairs of the form (W, b) for use
    in a dynamic RNN from an opaque parameter tensor given by cudnn.

    The result is indexed as `result[layer][direction]` and each entry is
    the list `[W, b]`. All weights are read from `params` first (layer by
    layer, direction by direction), followed by all biases in the same
    order.

    Raises:
        ValueError: if `input_mode` is not "linear_input", or if
            `direction` is not a known direction name.
    """
    if input_mode != "linear_input":
        raise ValueError("Only input_mode == linear_input supported for now.")
    num_directions = direction_to_num_directions(direction)

    # The first layer sees the raw input; every later layer sees the
    # concatenated outputs of all directions below it.
    layer_input_sizes = [
        cell_input_size if layer == 0 else hidden_size * num_directions
        for layer in range(num_layers)
    ]
    all_weights = [
        [[] for _ in range(num_directions)] for _ in range(num_layers)
    ]

    offset = 0
    for consume in (consume_weights_direction, consume_biases_direction):
        for layer_slots, isize in zip(all_weights, layer_input_sizes):
            for slot in layer_slots:
                tensor, offset = consume(
                    params,
                    old_offset=offset,
                    hidden_size=hidden_size,
                    isize=isize
                )
                slot.append(tensor)
    return all_weights
class FakeCudnnLSTM(object):
    """Stack of LSTM layers run with `tf.nn.dynamic_rnn` from cudnn params.

    The flat parameter tensor is unpacked by `decompose_layer_params` and
    each (layer, direction) pair is run as a `ParametrizedLSTMCell`.
    """

    def __init__(self, num_layers, hidden_size,
                 cell_input_size, input_mode, direction):
        """Record the network configuration.

        Arguments:
            num_layers: int, number of stacked layers.
            hidden_size: int, number of hidden units per direction.
            cell_input_size: int, feature width of the first layer's input.
            input_mode: str, forwarded to `decompose_layer_params`.
            direction: str, forwarded to `decompose_layer_params`.
        """
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.cell_input_size = cell_input_size
        self.input_mode = input_mode
        self.direction = direction

    def __call__(self,
                 inputs,
                 input_h,
                 input_c,
                 params,
                 is_training=True):
        """Run the stacked LSTM over `inputs`.

        Arguments:
            inputs: time-major input tensor (time is axis 0).
            input_h: initial hidden states, indexed on axis 0 by a counter
                that runs over every (layer, direction) cell in order.
            input_c: initial cell states, indexed like `input_h`.
            params: flat cudnn parameter tensor.
            is_training: unused; accepted for call compatibility with the
                cudnn cell.

        Returns:
            `(hiddens, final_h, final_c)`. `hiddens` is the output sequence
            of the top layer, with both directions concatenated on axis 2
            when there is more than one. `final_h` / `final_c` are the
            final states of the top layer, concatenated on axis 1 when
            there is more than one direction. Both are None if there are
            no layers.
            NOTE(review): the real cudnn op presumably returns one state
            per (layer, direction) rather than top-layer states only;
            confirm before relying on `final_h` / `final_c`.
        """
        layer_params = decompose_layer_params(
            params,
            num_layers=self.num_layers,
            hidden_size=self.hidden_size,
            cell_input_size=self.cell_input_size,
            input_mode=self.input_mode,
            direction=self.direction
        )
        layer_inputs = inputs
        final_output_h = None
        final_output_c = None
        cell_idx = 0
        for layer_param in layer_params:
            hidden_fwd_bwd = []
            layer_output_h = []
            layer_output_c = []
            for idx, (W, b) in enumerate(layer_param):
                # The second direction reads the sequence back to front.
                backward = idx == 1
                cell_inputs = layer_inputs
                if backward:
                    cell_inputs = tf.reverse(layer_inputs, axis=[0])
                hiddens, (output_h, output_c) = tf.nn.dynamic_rnn(
                    cell=ParametrizedLSTMCell(W, b, self.hidden_size),
                    inputs=cell_inputs,
                    dtype=inputs.dtype,
                    time_major=True,
                    initial_state=(input_h[cell_idx], input_c[cell_idx])
                )
                if backward:
                    # Restore chronological order before merging.
                    hiddens = tf.reverse(hiddens, axis=[0])
                hidden_fwd_bwd.append(hiddens)
                # Collect the final states so they can be returned.
                layer_output_h.append(output_h)
                layer_output_c.append(output_c)
                cell_idx += 1
            if len(hidden_fwd_bwd) > 1:
                layer_inputs = tf.concat(values=hidden_fwd_bwd, axis=2)
                final_output_c = tf.concat(values=layer_output_c, axis=1)
                final_output_h = tf.concat(values=layer_output_h, axis=1)
            else:
                layer_inputs = hidden_fwd_bwd[0]
                final_output_c = layer_output_c[0]
                final_output_h = layer_output_h[0]
        return layer_inputs, final_output_h, final_output_c
def build_model(inputs, num_layers, hidden_size,
                direction="bidirectional",
                faux_cudnn=True):
    """
    Run an RNN on inputs using either cudnn or dynamic RNN.

    Arguments:
        inputs: tensor [time, batch, features], dtype tf.float32
        num_layers: int
        hidden_size: int
        direction: str, bidirectional or unidirectional
        faux_cudnn: bool; Set faux_cudnn to True to use a dynamic RNN
            with the same weights as a CuDNN LSTM.

    Returns:
        The `hiddens` tensor produced by the selected cell; the final
        h / c states are discarded.

    The parameters live in a single flat variable named "RNNParams" whose
    length comes from `estimate_cudnn_parameter_size`. Both routes start
    from an all-zero initial state.
    """
    # The variable and the initial state take the dtype of the inputs.
    dtype = inputs.dtype
    input_size = inputs.get_shape()[-1].value
    num_directions = direction_to_num_directions(direction)
    est_size = estimate_cudnn_parameter_size(
        num_layers=num_layers,
        hidden_size=hidden_size,
        input_size=input_size,
        input_mode="linear_input",
        direction=direction
    )
    if faux_cudnn:
        # dynamic rnn route:
        cudnn_cell = FakeCudnnLSTM(
            num_layers,
            hidden_size,
            input_size,
            input_mode="linear_input",
            direction=direction
        )
    else:
        # cudnn route:
        cudnn_cell = tf.contrib.cudnn_rnn.CudnnLSTM(
            num_layers,
            hidden_size,
            input_size,
            input_mode="linear_input",
            direction=direction
        )
    # build parameters the opaque way:
    cudnn_params = tf.get_variable(
        "RNNParams",
        shape=[est_size],
        dtype=dtype,
        initializer=tf.contrib.layers.variance_scaling_initializer()
    )
    # start out with a zero state: one slot per (layer, direction) cell,
    # tiled along the batch axis to the run-time batch size.
    init_state = tf.tile(
        tf.zeros(
            [num_directions * num_layers, 1, hidden_size],
            dtype=dtype
        ),
        [1, tf.shape(inputs)[1], 1]
    )
    hiddens, _, _ = cudnn_cell(
        inputs,
        input_h=init_state,
        input_c=init_state,
        params=cudnn_params,
        is_training=True
    )
    return hiddens
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, there are many errors when trying to run this script. For example, `dtype` and `embed` are not defined in `build_model`, and the argument orders in `tf.concat` and `tf.split` are wrong according to the API docs (TF 1.1–1.3). I am wondering how you could possibly run this code. Can you upload the actual code you used, or fix it and give a clearer example? Thanks in advance!