Skip to content

Instantly share code, notes, and snippets.

@albertz
Created February 18, 2022 14:01
Show Gist options
  • Save albertz/39813d93f2690b4c6d7347864a2b4b04 to your computer and use it in GitHub Desktop.
Save albertz/39813d93f2690b4c6d7347864a2b4b04 to your computer and use it in GitHub Desktop.
from returnn.tf.util.data import Dim, batch_dim, single_step_dim, SpatialDim, FeatureDim
# --- Global RETURNN config switches -----------------------------------------
# NOTE(review): presumably selects the TF backend and pins the RETURNN
# behavior version this config was written against -- confirm in RETURNN docs.
use_tensorflow = True
behavior_version = 12

# --- Dim tags ----------------------------------------------------------------
# Every tag below is a distinct object, even when two tags share the same
# description string (e.g. the several 'filter-dim0' tags). The layer dicts
# further down refer to these objects by identity, so none may be merged.

# Input stream dims (used by extern_data['data']).
time_dim = SpatialDim(description='time')
input_dim = FeatureDim(description='input', dimension=10)

# Size-1 feature dim added by conv_subsample_layer/split_dims.
dummy_input_feature_dim = FeatureDim(description='dummy-input-feature-dim', dimension=1)

# conv_subsample_layer/conv_layers.0: 3x3 filter, 14 output channels.
filter_dim0_dim = SpatialDim(description='filter-dim0', dimension=3)
filter_dim1_dim = SpatialDim(description='filter-dim1', dimension=3)
intermediate_out_sub_sample_dim = FeatureDim(description='intermediate_out_sub_sample', dimension=14)

# Spatial dims produced by conv_subsample_layer/pool.
conv_subsample_layer_out_spatial_dim0_dim = SpatialDim(description='conv_subsample_layer:out-spatial-dim0')
conv_subsample_layer_out_spatial_dim1_dim = SpatialDim(
    description='conv_subsample_layer:out-spatial-dim1', dimension=5)

# conv_subsample_layer/conv_layers.1: 3x3 filter, 14 output channels.
filter_dim0_0_dim = SpatialDim(description='filter-dim0', dimension=3)
filter_dim1_0_dim = SpatialDim(description='filter-dim1', dimension=3)
out_dim = FeatureDim(description='out', dimension=14)

# Spatial dims produced by conv_subsample_layer/pool_0, and their merged result.
conv_subsample_layer_out_spatial_dim0_0_dim = SpatialDim(description='conv_subsample_layer:out-spatial-dim0')
conv_subsample_layer_out_spatial_dim1_0_dim = SpatialDim(
    description='conv_subsample_layer:out-spatial-dim1', dimension=3)
conv_subsample_layer_out_dim = SpatialDim(description='conv_subsample_layer:out_dim')

# Feed-forward inner dim and attention head count used inside each block of 'layers'.
ff_dim = FeatureDim(description='ff', dimension=17)
num_heads_dim = SpatialDim(description='num_heads', dimension=2)

# Per-block dims: key/value axis of self-attention, and the depthwise conv filter.
layers_0_self_att_history_dim = SpatialDim(description='layers/0/self_att:history')
filter_dim0_1_dim = SpatialDim(description='filter-dim0', dimension=32)
layers_1_self_att_history_dim = SpatialDim(description='layers/1/self_att:history')
# Declares the single input stream 'data': a float32 tensor with dims
# (batch, time, input), flagged as available at inference time.
extern_data = dict(
    data=dict(
        dim_tags=(batch_dim, time_dim, input_dim),
        dtype='float32',
        available_for_inference=True,
    ),
)
network = {
'conv_subsample_layer': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'split_dims': {
'class': 'split_dims',
'from': 'base:data:data',
'axis': input_dim,
'dims': (
input_dim,
dummy_input_feature_dim
),
'out_shape': {batch_dim, time_dim, input_dim, dummy_input_feature_dim}
},
'conv_layers.0': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'random': {
'class': 'random',
'shape': (
filter_dim0_dim,
filter_dim1_dim,
dummy_input_feature_dim,
intermediate_out_sub_sample_dim
),
'distribution': 'uniform',
'minval': -0.21081851067789195,
'maxval': 0.21081851067789195,
'dtype': 'float32',
'static': True
},
'conv': {
'class': 'conv',
'from': 'base:split_dims',
'in_dim': dummy_input_feature_dim,
'in_spatial_dims': [
time_dim,
input_dim
],
'out_dim': intermediate_out_sub_sample_dim,
'out_spatial_dims': [
time_dim,
input_dim
],
'filter_size': [3, 3],
'padding': 'same',
'filter': 'filter',
'with_bias': True,
'bias': 'bias',
'out_shape': {batch_dim, time_dim, input_dim, intermediate_out_sub_sample_dim}
},
'output': {
'class': 'copy',
'from': 'conv',
'out_shape': {batch_dim, time_dim, input_dim, intermediate_out_sub_sample_dim}
},
'bias': {
'class': 'variable',
'shape': [
intermediate_out_sub_sample_dim
],
'param_name': 'param',
'init': 0.0
},
'filter': {
'class': 'variable',
'shape': [
filter_dim0_dim,
filter_dim1_dim,
dummy_input_feature_dim,
intermediate_out_sub_sample_dim
],
'param_name': 'param',
'init_by_layer': 'random'
}
},
'out_shape': {batch_dim, time_dim, input_dim, intermediate_out_sub_sample_dim},
'name_scope': 'conv_layers/0'
},
'relu': {
'class': 'activation',
'from': 'conv_layers.0/conv',
'activation': 'relu',
'out_shape': {batch_dim, time_dim, input_dim, intermediate_out_sub_sample_dim}
},
'pool': {
'class': 'pool',
'from': 'relu',
'mode': 'max',
'pool_size': (2, 2),
'padding': 'same',
'in_spatial_dims': [
time_dim,
input_dim
],
'out_spatial_dims': [
conv_subsample_layer_out_spatial_dim0_dim,
conv_subsample_layer_out_spatial_dim1_dim
],
'out_shape': {batch_dim, intermediate_out_sub_sample_dim, conv_subsample_layer_out_spatial_dim0_dim, conv_subsample_layer_out_spatial_dim1_dim}
},
'dropout_0': {
'class': 'dropout',
'from': 'pool',
'dropout': 0.1,
'dropout_axis': intermediate_out_sub_sample_dim,
'out_shape': {batch_dim, intermediate_out_sub_sample_dim, conv_subsample_layer_out_spatial_dim0_dim, conv_subsample_layer_out_spatial_dim1_dim}
},
'conv_layers.1': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'random': {
'class': 'random',
'shape': (
filter_dim0_0_dim,
filter_dim1_0_dim,
intermediate_out_sub_sample_dim,
out_dim
),
'distribution': 'uniform',
'minval': -0.1543033499620919,
'maxval': 0.1543033499620919,
'dtype': 'float32',
'static': True
},
'conv': {
'class': 'conv',
'from': 'base:dropout_0',
'in_dim': intermediate_out_sub_sample_dim,
'in_spatial_dims': [
conv_subsample_layer_out_spatial_dim0_dim,
conv_subsample_layer_out_spatial_dim1_dim
],
'out_dim': out_dim,
'out_spatial_dims': [
conv_subsample_layer_out_spatial_dim0_dim,
conv_subsample_layer_out_spatial_dim1_dim
],
'filter_size': [3, 3],
'padding': 'same',
'filter': 'filter',
'with_bias': True,
'bias': 'bias',
'out_shape': {batch_dim, conv_subsample_layer_out_spatial_dim0_dim, conv_subsample_layer_out_spatial_dim1_dim, out_dim}
},
'output': {
'class': 'copy',
'from': 'conv',
'out_shape': {batch_dim, conv_subsample_layer_out_spatial_dim0_dim, conv_subsample_layer_out_spatial_dim1_dim, out_dim}
},
'bias': {
'class': 'variable',
'shape': [
out_dim
],
'param_name': 'param',
'init': 0.0
},
'filter': {
'class': 'variable',
'shape': [
filter_dim0_0_dim,
filter_dim1_0_dim,
intermediate_out_sub_sample_dim,
out_dim
],
'param_name': 'param',
'init_by_layer': 'random'
}
},
'out_shape': {batch_dim, conv_subsample_layer_out_spatial_dim0_dim, conv_subsample_layer_out_spatial_dim1_dim, out_dim},
'name_scope': 'conv_layers/1'
},
'relu_0': {
'class': 'activation',
'from': 'conv_layers.1/conv',
'activation': 'relu',
'out_shape': {batch_dim, conv_subsample_layer_out_spatial_dim0_dim, conv_subsample_layer_out_spatial_dim1_dim, out_dim}
},
'pool_0': {
'class': 'pool',
'from': 'relu_0',
'mode': 'max',
'pool_size': (2, 2),
'padding': 'same',
'in_spatial_dims': [
conv_subsample_layer_out_spatial_dim0_dim,
conv_subsample_layer_out_spatial_dim1_dim
],
'out_spatial_dims': [
conv_subsample_layer_out_spatial_dim0_0_dim,
conv_subsample_layer_out_spatial_dim1_0_dim
],
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_spatial_dim0_0_dim, conv_subsample_layer_out_spatial_dim1_0_dim}
},
'dropout_1': {
'class': 'dropout',
'from': 'pool_0',
'dropout': 0.1,
'dropout_axis': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_spatial_dim0_0_dim, conv_subsample_layer_out_spatial_dim1_0_dim}
},
'merge_dims': {
'class': 'merge_dims',
'from': 'dropout_1',
'axes': [
conv_subsample_layer_out_spatial_dim0_0_dim,
conv_subsample_layer_out_spatial_dim1_0_dim
],
'out_dim': conv_subsample_layer_out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'output': {
'class': 'copy',
'from': 'merge_dims',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'linear': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'random': {
'class': 'random',
'shape': (
out_dim.copy(match_priority=1),
out_dim
),
'distribution': 'uniform',
'minval': -0.4629100498862757,
'maxval': 0.4629100498862757,
'dtype': 'float32',
'static': True
},
'dot': {
'class': 'dot',
'from': ['base:conv_subsample_layer/merge_dims', 'weight'],
'reduce': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'output': {
'class': 'copy',
'from': 'dot',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'weight': {
'class': 'variable',
'shape': [
out_dim.copy(match_priority=1),
out_dim
],
'param_name': 'param',
'init_by_layer': 'random'
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'dropout_0': {
'class': 'dropout',
'from': 'linear',
'dropout': 0.1,
'dropout_axis': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'layers': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'0': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'layer_norm': {
'class': 'layer_norm',
'from': 'base:base:dropout_0',
'in_dim': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'ffn1': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'linear_ff': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'random': {
'class': 'random',
'shape': (
out_dim,
ff_dim
),
'distribution': 'uniform',
'minval': -0.43994134506405985,
'maxval': 0.43994134506405985,
'dtype': 'float32',
'static': True
},
'dot': {
'class': 'dot',
'from': ['base:base:layer_norm', 'weight'],
'reduce': out_dim,
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'add': {
'class': 'combine',
'from': ['dot', 'bias'],
'kind': 'add',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'output': {
'class': 'copy',
'from': 'add',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'weight': {
'class': 'variable',
'shape': [
out_dim,
ff_dim
],
'param_name': 'param',
'init_by_layer': 'random'
},
'bias': {
'class': 'variable',
'shape': [
ff_dim
],
'param_name': 'param',
'init': 0.0
}
},
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'swish': {
'class': 'activation',
'from': 'linear_ff',
'activation': 'swish',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'dropout_0': {
'class': 'dropout',
'from': 'swish',
'dropout': 0.1,
'dropout_axis': out_dim,
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'linear_out': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'random': {
'class': 'random',
'shape': (
ff_dim,
out_dim
),
'distribution': 'uniform',
'minval': -0.43994134506405985,
'maxval': 0.43994134506405985,
'dtype': 'float32',
'static': True
},
'dot': {
'class': 'dot',
'from': ['base:dropout_0', 'weight'],
'reduce': ff_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'add': {
'class': 'combine',
'from': ['dot', 'bias'],
'kind': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'output': {
'class': 'copy',
'from': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'weight': {
'class': 'variable',
'shape': [
ff_dim,
out_dim
],
'param_name': 'param',
'init_by_layer': 'random'
},
'bias': {
'class': 'variable',
'shape': [
out_dim
],
'param_name': 'param',
'init': 0.0
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'output': {
'class': 'copy',
'from': 'linear_out',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'dropout_0': {
'class': 'dropout',
'from': 'ffn1',
'dropout': 0.1,
'dropout_axis': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'constant': {'class': 'constant', 'value': 0.5},
'mul': {
'class': 'combine',
'from': ['constant', 'dropout_0'],
'kind': 'mul',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'add': {
'class': 'combine',
'from': ['mul', 'base:base:dropout_0'],
'kind': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'layer_norm_0': {
'class': 'layer_norm',
'from': 'add',
'in_dim': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'self_att': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'qkv': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'random': {
'class': 'random',
'shape': (
out_dim,
3 * out_dim
),
'distribution': 'uniform',
'minval': -0.32732683535398854,
'maxval': 0.32732683535398854,
'dtype': 'float32',
'static': True
},
'dot': {
'class': 'dot',
'from': ['base:base:layer_norm_0', 'weight'],
'reduce': out_dim,
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim}
},
'add': {
'class': 'combine',
'from': ['dot', 'bias'],
'kind': 'add',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim}
},
'output': {
'class': 'copy',
'from': 'add',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim}
},
'weight': {
'class': 'variable',
'shape': [
out_dim,
3 * out_dim
],
'param_name': 'param',
'init_by_layer': 'random'
},
'bias': {
'class': 'variable',
'shape': [
3 * out_dim
],
'param_name': 'param',
'init': 0.0
}
},
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim}
},
'qkv_split_dims': {
'class': 'split_dims',
'from': 'qkv',
'axis': 3 * out_dim,
'dims': (
num_heads_dim,
3 * out_dim.div_left(num_heads_dim)
),
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, 3 * out_dim.div_left(num_heads_dim)}
},
'qkv_split': {
'class': 'split',
'from': 'qkv_split_dims',
'axis': 3 * out_dim.div_left(num_heads_dim),
'out_dims': (
out_dim.div_left(num_heads_dim),
out_dim.div_left(num_heads_dim),
out_dim.div_left(num_heads_dim)
),
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, 3 * out_dim.div_left(num_heads_dim)}
},
'k_new_dim': {
'class': 'reinterpret_data',
'set_dim_tags': {
conv_subsample_layer_out_dim: layers_0_self_att_history_dim
},
'from': 'qkv_split/1',
'out_shape': {batch_dim, num_heads_dim, layers_0_self_att_history_dim, out_dim.div_left(num_heads_dim)}
},
'v_new_dim': {
'class': 'reinterpret_data',
'set_dim_tags': {
conv_subsample_layer_out_dim: layers_0_self_att_history_dim
},
'from': 'qkv_split/2',
'out_shape': {batch_dim, num_heads_dim, layers_0_self_att_history_dim, out_dim.div_left(num_heads_dim)}
},
'dot_attention': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'constant': {'class': 'constant', 'value': 0.37796447300922725},
'mul': {
'class': 'combine',
'from': ['base:qkv_split/0', 'constant'],
'kind': 'mul',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)}
},
'energy': {
'class': 'dot',
'from': ['mul', 'base:k_new_dim'],
'reduce': out_dim.div_left(num_heads_dim),
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, layers_0_self_att_history_dim}
},
'att_weights': {
'class': 'softmax_over_spatial',
'from': 'energy',
'axis': layers_0_self_att_history_dim,
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, layers_0_self_att_history_dim}
},
'dropout': {
'class': 'dropout',
'from': 'att_weights',
'dropout': 0.1,
'dropout_axis': layers_0_self_att_history_dim,
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, layers_0_self_att_history_dim}
},
'att': {
'class': 'dot',
'from': ['dropout', 'base:v_new_dim'],
'reduce': layers_0_self_att_history_dim,
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)}
},
'output': {
'class': 'copy',
'from': 'att',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)}
}
},
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)}
},
'output_0': {
'class': 'merge_dims',
'from': 'dot_attention',
'axes': (
num_heads_dim,
out_dim.div_left(num_heads_dim)
),
'out_dim': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'output': {
'class': 'copy',
'from': 'output_0',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'add_0': {
'class': 'combine',
'from': ['self_att', 'add'],
'kind': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'layer_norm_1': {
'class': 'layer_norm',
'from': 'add_0',
'in_dim': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'conv_block': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'positionwise_conv1': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'random': {
'class': 'random',
'shape': (
out_dim,
2 * out_dim
),
'distribution': 'uniform',
'minval': -0.3779644730092272,
'maxval': 0.3779644730092272,
'dtype': 'float32',
'static': True
},
'dot': {
'class': 'dot',
'from': ['base:base:layer_norm_1', 'weight'],
'reduce': out_dim,
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim}
},
'add': {
'class': 'combine',
'from': ['dot', 'bias'],
'kind': 'add',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim}
},
'output': {
'class': 'copy',
'from': 'add',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim}
},
'weight': {
'class': 'variable',
'shape': [
out_dim,
2 * out_dim
],
'param_name': 'param',
'init_by_layer': 'random'
},
'bias': {
'class': 'variable',
'shape': [
2 * out_dim
],
'param_name': 'param',
'init': 0.0
}
},
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim}
},
'glu': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'split': {
'class': 'split',
'from': 'base:positionwise_conv1',
'axis': 2 * out_dim,
'out_dims': [
out_dim,
out_dim
],
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim}
},
'sigmoid': {
'class': 'activation',
'from': 'split/1',
'activation': 'sigmoid',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'mul': {
'class': 'combine',
'from': ['split/0', 'sigmoid'],
'kind': 'mul',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'output': {
'class': 'copy',
'from': 'mul',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'depthwise_conv': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'random': {
'class': 'random',
'shape': (
filter_dim0_1_dim,
out_dim // 14,
out_dim
),
'distribution': 'uniform',
'minval': -0.11180339887498948,
'maxval': 0.11180339887498948,
'dtype': 'float32',
'static': True
},
'conv': {
'class': 'conv',
'from': 'base:glu',
'in_dim': out_dim,
'in_spatial_dims': [
conv_subsample_layer_out_dim
],
'out_dim': out_dim,
'out_spatial_dims': [
conv_subsample_layer_out_dim
],
'filter_size': [32],
'padding': 'same',
'groups': 14,
'filter': 'filter',
'with_bias': True,
'bias': 'bias',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'output': {
'class': 'copy',
'from': 'conv',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'bias': {
'class': 'variable',
'shape': [
out_dim
],
'param_name': 'param',
'init': 0.0
},
'filter': {
'class': 'variable',
'shape': [
filter_dim0_1_dim,
out_dim // 14,
out_dim
],
'param_name': 'param',
'init_by_layer': 'random'
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'norm': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'batch_norm': {
'class': 'batch_norm',
'from': 'base:depthwise_conv/conv',
'in_dim': out_dim,
'use_std': True,
'use_shift': True,
'param_version': 2,
'reuse_params': {
'map': {
'batch_norm/v2_mean': {'layer_output': 'running_mean'},
'batch_norm/v2_variance': {'layer_output': 'running_variance'},
'batch_norm/v2_gamma': {'layer_output': 'gamma'},
'batch_norm/v2_beta': {'layer_output': 'beta'}
}
},
'momentum': 0.1,
'epsilon': 0.001,
'masked_time': False,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'output': {
'class': 'copy',
'from': 'batch_norm',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'beta': {
'class': 'variable',
'shape': [
out_dim
],
'param_name': 'param',
'init': 0.0
},
'gamma': {
'class': 'variable',
'shape': [
out_dim
],
'param_name': 'param',
'init': 1.0
},
'running_mean': {
'class': 'variable',
'shape': [
out_dim
],
'param_name': 'param',
'trainable': False,
'init': 0.0
},
'running_variance': {
'class': 'variable',
'shape': [
out_dim
],
'param_name': 'param',
'trainable': False,
'init': 1.0
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'swish': {
'class': 'activation',
'from': 'norm',
'activation': 'swish',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'positionwise_conv2': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'random': {
'class': 'random',
'shape': (
out_dim.copy(match_priority=1),
out_dim
),
'distribution': 'uniform',
'minval': -0.4629100498862757,
'maxval': 0.4629100498862757,
'dtype': 'float32',
'static': True
},
'dot': {
'class': 'dot',
'from': ['base:swish', 'weight'],
'reduce': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'add': {
'class': 'combine',
'from': ['dot', 'bias'],
'kind': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'output': {
'class': 'copy',
'from': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'weight': {
'class': 'variable',
'shape': [
out_dim.copy(match_priority=1),
out_dim
],
'param_name': 'param',
'init_by_layer': 'random'
},
'bias': {
'class': 'variable',
'shape': [
out_dim
],
'param_name': 'param',
'init': 0.0
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'output': {
'class': 'copy',
'from': 'positionwise_conv2',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'dropout_1': {
'class': 'dropout',
'from': 'conv_block',
'dropout': 0.1,
'dropout_axis': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'add_1': {
'class': 'combine',
'from': ['dropout_1', 'add_0'],
'kind': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'layer_norm_2': {
'class': 'layer_norm',
'from': 'add_1',
'in_dim': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'ffn2': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'linear_ff': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'random': {
'class': 'random',
'shape': (
out_dim,
ff_dim
),
'distribution': 'uniform',
'minval': -0.43994134506405985,
'maxval': 0.43994134506405985,
'dtype': 'float32',
'static': True
},
'dot': {
'class': 'dot',
'from': ['base:base:layer_norm_2', 'weight'],
'reduce': out_dim,
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'add': {
'class': 'combine',
'from': ['dot', 'bias'],
'kind': 'add',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'output': {
'class': 'copy',
'from': 'add',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'weight': {
'class': 'variable',
'shape': [
out_dim,
ff_dim
],
'param_name': 'param',
'init_by_layer': 'random'
},
'bias': {
'class': 'variable',
'shape': [
ff_dim
],
'param_name': 'param',
'init': 0.0
}
},
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'swish': {
'class': 'activation',
'from': 'linear_ff',
'activation': 'swish',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'dropout_0': {
'class': 'dropout',
'from': 'swish',
'dropout': 0.1,
'dropout_axis': out_dim,
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'linear_out': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'random': {
'class': 'random',
'shape': (
ff_dim,
out_dim
),
'distribution': 'uniform',
'minval': -0.43994134506405985,
'maxval': 0.43994134506405985,
'dtype': 'float32',
'static': True
},
'dot': {
'class': 'dot',
'from': ['base:dropout_0', 'weight'],
'reduce': ff_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'add': {
'class': 'combine',
'from': ['dot', 'bias'],
'kind': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'output': {
'class': 'copy',
'from': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'weight': {
'class': 'variable',
'shape': [
ff_dim,
out_dim
],
'param_name': 'param',
'init_by_layer': 'random'
},
'bias': {
'class': 'variable',
'shape': [
out_dim
],
'param_name': 'param',
'init': 0.0
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'output': {
'class': 'copy',
'from': 'linear_out',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'dropout_2': {
'class': 'dropout',
'from': 'ffn2',
'dropout': 0.1,
'dropout_axis': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'constant_0': {'class': 'constant', 'value': 0.5},
'mul_0': {
'class': 'combine',
'from': ['constant_0', 'dropout_2'],
'kind': 'mul',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'add_2': {
'class': 'combine',
'from': ['mul_0', 'add_1'],
'kind': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'layer_norm_3': {
'class': 'layer_norm',
'from': 'add_2',
'in_dim': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'output': {
'class': 'copy',
'from': 'layer_norm_3',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'1': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'layer_norm': {
'class': 'layer_norm',
'from': 'base:0',
'in_dim': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'ffn1': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'linear_ff': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'random': {
'class': 'random',
'shape': (
out_dim,
ff_dim
),
'distribution': 'uniform',
'minval': -0.43994134506405985,
'maxval': 0.43994134506405985,
'dtype': 'float32',
'static': True
},
'dot': {
'class': 'dot',
'from': ['base:base:layer_norm', 'weight'],
'reduce': out_dim,
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'add': {
'class': 'combine',
'from': ['dot', 'bias'],
'kind': 'add',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'output': {
'class': 'copy',
'from': 'add',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'weight': {
'class': 'variable',
'shape': [
out_dim,
ff_dim
],
'param_name': 'param',
'init_by_layer': 'random'
},
'bias': {
'class': 'variable',
'shape': [
ff_dim
],
'param_name': 'param',
'init': 0.0
}
},
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'swish': {
'class': 'activation',
'from': 'linear_ff',
'activation': 'swish',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'dropout_0': {
'class': 'dropout',
'from': 'swish',
'dropout': 0.1,
'dropout_axis': out_dim,
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'linear_out': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'random': {
'class': 'random',
'shape': (
ff_dim,
out_dim
),
'distribution': 'uniform',
'minval': -0.43994134506405985,
'maxval': 0.43994134506405985,
'dtype': 'float32',
'static': True
},
'dot': {
'class': 'dot',
'from': ['base:dropout_0', 'weight'],
'reduce': ff_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'add': {
'class': 'combine',
'from': ['dot', 'bias'],
'kind': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'output': {
'class': 'copy',
'from': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'weight': {
'class': 'variable',
'shape': [
ff_dim,
out_dim
],
'param_name': 'param',
'init_by_layer': 'random'
},
'bias': {
'class': 'variable',
'shape': [
out_dim
],
'param_name': 'param',
'init': 0.0
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'output': {
'class': 'copy',
'from': 'linear_out',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'dropout_0': {
'class': 'dropout',
'from': 'ffn1',
'dropout': 0.1,
'dropout_axis': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'constant': {'class': 'constant', 'value': 0.5},
'mul': {
'class': 'combine',
'from': ['constant', 'dropout_0'],
'kind': 'mul',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'add': {
'class': 'combine',
'from': ['mul', 'base:0'],
'kind': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'layer_norm_0': {
'class': 'layer_norm',
'from': 'add',
'in_dim': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'self_att': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'qkv': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'random': {
'class': 'random',
'shape': (
out_dim,
3 * out_dim
),
'distribution': 'uniform',
'minval': -0.32732683535398854,
'maxval': 0.32732683535398854,
'dtype': 'float32',
'static': True
},
'dot': {
'class': 'dot',
'from': ['base:base:layer_norm_0', 'weight'],
'reduce': out_dim,
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim}
},
'add': {
'class': 'combine',
'from': ['dot', 'bias'],
'kind': 'add',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim}
},
'output': {
'class': 'copy',
'from': 'add',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim}
},
'weight': {
'class': 'variable',
'shape': [
out_dim,
3 * out_dim
],
'param_name': 'param',
'init_by_layer': 'random'
},
'bias': {
'class': 'variable',
'shape': [
3 * out_dim
],
'param_name': 'param',
'init': 0.0
}
},
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim}
},
'qkv_split_dims': {
'class': 'split_dims',
'from': 'qkv',
'axis': 3 * out_dim,
'dims': (
num_heads_dim,
3 * out_dim.div_left(num_heads_dim)
),
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, 3 * out_dim.div_left(num_heads_dim)}
},
'qkv_split': {
'class': 'split',
'from': 'qkv_split_dims',
'axis': 3 * out_dim.div_left(num_heads_dim),
'out_dims': (
out_dim.div_left(num_heads_dim),
out_dim.div_left(num_heads_dim),
out_dim.div_left(num_heads_dim)
),
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, 3 * out_dim.div_left(num_heads_dim)}
},
'k_new_dim': {
'class': 'reinterpret_data',
'set_dim_tags': {
conv_subsample_layer_out_dim: layers_1_self_att_history_dim
},
'from': 'qkv_split/1',
'out_shape': {batch_dim, num_heads_dim, layers_1_self_att_history_dim, out_dim.div_left(num_heads_dim)}
},
'v_new_dim': {
'class': 'reinterpret_data',
'set_dim_tags': {
conv_subsample_layer_out_dim: layers_1_self_att_history_dim
},
'from': 'qkv_split/2',
'out_shape': {batch_dim, num_heads_dim, layers_1_self_att_history_dim, out_dim.div_left(num_heads_dim)}
},
'dot_attention': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
'constant': {'class': 'constant', 'value': 0.37796447300922725},
'mul': {
'class': 'combine',
'from': ['base:qkv_split/0', 'constant'],
'kind': 'mul',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)}
},
'energy': {
'class': 'dot',
'from': ['mul', 'base:k_new_dim'],
'reduce': out_dim.div_left(num_heads_dim),
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, layers_1_self_att_history_dim}
},
'att_weights': {
'class': 'softmax_over_spatial',
'from': 'energy',
'axis': layers_1_self_att_history_dim,
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, layers_1_self_att_history_dim}
},
'dropout': {
'class': 'dropout',
'from': 'att_weights',
'dropout': 0.1,
'dropout_axis': layers_1_self_att_history_dim,
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, layers_1_self_att_history_dim}
},
'att': {
'class': 'dot',
'from': ['dropout', 'base:v_new_dim'],
'reduce': layers_1_self_att_history_dim,
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)}
},
'output': {
'class': 'copy',
'from': 'att',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)}
}
},
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)}
},
'output_0': {
'class': 'merge_dims',
'from': 'dot_attention',
'axes': (
num_heads_dim,
out_dim.div_left(num_heads_dim)
),
'out_dim': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'output': {
'class': 'copy',
'from': 'output_0',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'add_0': {
'class': 'combine',
'from': ['self_att', 'add'],
'kind': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'layer_norm_1': {
'class': 'layer_norm',
'from': 'add_0',
'in_dim': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'conv_block': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
# Position-wise linear projection out_dim -> 2 * out_dim:
# 'dot' contracts out_dim of 'base:base:layer_norm_1' with 'weight', then
# 'add' applies 'bias'. The subnetwork lists no direct inputs ('from': []);
# the input is reached through the 'base:base:' reference instead.
'positionwise_conv1': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
# Initial values for 'weight' (referenced there via init_by_layer): uniform in
# +/-0.37796..., which equals sqrt(6 / (14 + 28)) for out_dim = 14 as declared
# at the top of the file.
'random': {
'class': 'random',
'shape': (
out_dim,
2 * out_dim
),
'distribution': 'uniform',
'minval': -0.3779644730092272,
'maxval': 0.3779644730092272,
'dtype': 'float32',
'static': True
},
# Matrix product with 'weight'; out_dim is summed away, 2 * out_dim remains.
'dot': {
'class': 'dot',
'from': ['base:base:layer_norm_1', 'weight'],
'reduce': out_dim,
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim}
},
# Adds the bias vector (shape [2 * out_dim]) to the projection.
'add': {
'class': 'combine',
'from': ['dot', 'bias'],
'kind': 'add',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim}
},
# Result of this subnetwork: copy of 'add'.
'output': {
'class': 'copy',
'from': 'add',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim}
},
# Projection matrix of shape [out_dim, 2 * out_dim], initialized from 'random'.
'weight': {
'class': 'variable',
'shape': [
out_dim,
2 * out_dim
],
'param_name': 'param',
'init_by_layer': 'random'
},
# Bias vector of shape [2 * out_dim], initialized to zero.
'bias': {
'class': 'variable',
'shape': [
2 * out_dim
],
'param_name': 'param',
'init': 0.0
}
},
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim}
},
# Gated linear unit: splits 'base:positionwise_conv1' along 2 * out_dim into
# two parts of size out_dim and multiplies the first part ('split/0') with the
# sigmoid of the second part ('split/1'). The result has feature dim out_dim.
'glu': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
# Split of the 2 * out_dim axis into [out_dim, out_dim]; the parts are
# addressed below as 'split/0' and 'split/1'.
'split': {
'class': 'split',
'from': 'base:positionwise_conv1',
'axis': 2 * out_dim,
'out_dims': [
out_dim,
out_dim
],
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim}
},
# Gate: sigmoid of the second part.
'sigmoid': {
'class': 'activation',
'from': 'split/1',
'activation': 'sigmoid',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Element-wise product of the first part with the gate.
'mul': {
'class': 'combine',
'from': ['split/0', 'sigmoid'],
'kind': 'mul',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Result of this subnetwork: copy of 'mul'.
'output': {
'class': 'copy',
'from': 'mul',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Grouped 1-D convolution of 'base:glu' along conv_subsample_layer_out_dim:
# filter size 32, 'same' padding, 14 groups. Since out_dim has size 14 (see the
# top of the file), every group holds out_dim // 14 = 1 input channel, i.e. the
# convolution is depthwise. Input and output dims are identical.
'depthwise_conv': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
# Initial values for 'filter' (referenced there via init_by_layer):
# uniform in +/-0.11180...
'random': {
'class': 'random',
'shape': (
filter_dim0_1_dim,
out_dim // 14,
out_dim
),
'distribution': 'uniform',
'minval': -0.11180339887498948,
'maxval': 0.11180339887498948,
'dtype': 'float32',
'static': True
},
# The convolution itself; its kernel and bias are taken from the 'filter' and
# 'bias' variable layers of this subnetwork.
'conv': {
'class': 'conv',
'from': 'base:glu',
'in_dim': out_dim,
'in_spatial_dims': [
conv_subsample_layer_out_dim
],
'out_dim': out_dim,
'out_spatial_dims': [
conv_subsample_layer_out_dim
],
'filter_size': [32],
'padding': 'same',
'groups': 14,
'filter': 'filter',
'with_bias': True,
'bias': 'bias',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Result of this subnetwork: copy of 'conv'.
'output': {
'class': 'copy',
'from': 'conv',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Bias vector of shape [out_dim], initialized to zero.
'bias': {
'class': 'variable',
'shape': [
out_dim
],
'param_name': 'param',
'init': 0.0
},
# Kernel of shape [filter_dim0_1_dim (32), out_dim // 14, out_dim],
# initialized from 'random'.
'filter': {
'class': 'variable',
'shape': [
filter_dim0_1_dim,
out_dim // 14,
out_dim
],
'param_name': 'param',
'init_by_layer': 'random'
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Batch normalization of the depthwise convolution output. The statistics and
# the affine parameters live in the explicit variable layers of this
# subnetwork ('running_mean', 'running_variance', 'gamma', 'beta').
'norm': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
# Reads 'base:depthwise_conv/conv' directly, i.e. the inner conv layer rather
# than the 'output' copy of that subnetwork.
'batch_norm': {
'class': 'batch_norm',
'from': 'base:depthwise_conv/conv',
'in_dim': out_dim,
'use_std': True,
'use_shift': True,
'param_version': 2,
# Binds each batch-norm parameter name to the output of the variable layer
# named on the right-hand side.
'reuse_params': {
'map': {
'batch_norm/v2_mean': {'layer_output': 'running_mean'},
'batch_norm/v2_variance': {'layer_output': 'running_variance'},
'batch_norm/v2_gamma': {'layer_output': 'gamma'},
'batch_norm/v2_beta': {'layer_output': 'beta'}
}
},
'momentum': 0.1,
'epsilon': 0.001,
# NOTE(review): presumably means padded frames are not masked out of the
# statistics - confirm against the RETURNN batch_norm docs.
'masked_time': False,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Result of this subnetwork: copy of 'batch_norm'.
'output': {
'class': 'copy',
'from': 'batch_norm',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Shift parameter, shape [out_dim], initialized to 0.
'beta': {
'class': 'variable',
'shape': [
out_dim
],
'param_name': 'param',
'init': 0.0
},
# Scale parameter, shape [out_dim], initialized to 1.
'gamma': {
'class': 'variable',
'shape': [
out_dim
],
'param_name': 'param',
'init': 1.0
},
# Running mean, shape [out_dim], non-trainable, initialized to 0.
'running_mean': {
'class': 'variable',
'shape': [
out_dim
],
'param_name': 'param',
'trainable': False,
'init': 0.0
},
# Running variance, shape [out_dim], non-trainable, initialized to 1.
'running_variance': {
'class': 'variable',
'shape': [
out_dim
],
'param_name': 'param',
'trainable': False,
'init': 1.0
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Swish activation applied to the output of 'norm'; shape is unchanged.
'swish': {
'class': 'activation',
'from': 'norm',
'activation': 'swish',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Position-wise linear projection out_dim -> out_dim applied to 'base:swish':
# 'dot' contracts out_dim with 'weight', then 'add' applies 'bias'.
'positionwise_conv2': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
# Initial values for 'weight' (referenced there via init_by_layer): uniform in
# +/-0.46291..., which equals sqrt(6 / (14 + 14)) for out_dim = 14 as declared
# at the top of the file.
'random': {
'class': 'random',
'shape': (
out_dim.copy(match_priority=1),
out_dim
),
'distribution': 'uniform',
'minval': -0.4629100498862757,
'maxval': 0.4629100498862757,
'dtype': 'float32',
'static': True
},
# Matrix product with 'weight', reducing over out_dim.
'dot': {
'class': 'dot',
'from': ['base:swish', 'weight'],
'reduce': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Adds the bias vector (shape [out_dim]) to the projection.
'add': {
'class': 'combine',
'from': ['dot', 'bias'],
'kind': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Result of this subnetwork: copy of 'add'.
'output': {
'class': 'copy',
'from': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Square projection matrix; both axes are out_dim, the first one as a copy
# with match_priority=1.
# NOTE(review): presumably the priority tells 'dot' which of the two equal
# dims to contract - confirm against the RETURNN Dim docs.
'weight': {
'class': 'variable',
'shape': [
out_dim.copy(match_priority=1),
out_dim
],
'param_name': 'param',
'init_by_layer': 'random'
},
# Bias vector of shape [out_dim], initialized to zero.
'bias': {
'class': 'variable',
'shape': [
out_dim
],
'param_name': 'param',
'init': 0.0
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Result of this subnetwork: a plain copy of 'positionwise_conv2'.
'output': {
'class': 'copy',
'from': 'positionwise_conv2',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Dropout with rate 0.1 on the output of 'conv_block', with out_dim given as
# dropout_axis. Shape is unchanged.
'dropout_1': {
'class': 'dropout',
'from': 'conv_block',
'dropout': 0.1,
'dropout_axis': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Element-wise sum of 'dropout_1' and 'add_0'. Because 'add_0' is also the
# tensor that feeds the convolution branch (through 'layer_norm_1'), this acts
# as the residual connection around that branch.
'add_1': {
'class': 'combine',
'from': ['dropout_1', 'add_0'],
'kind': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Layer normalization of 'add_1', with out_dim passed as in_dim
# (presumably the axis that gets normalized - confirm). Shape is unchanged.
'layer_norm_2': {
'class': 'layer_norm',
'from': 'add_1',
'in_dim': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
'ffn2': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
# First feed-forward projection out_dim -> ff_dim:
# 'dot' contracts out_dim of 'base:base:layer_norm_2' with 'weight', then
# 'add' applies 'bias'.
'linear_ff': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
# Initial values for 'weight' (referenced there via init_by_layer): uniform in
# +/-0.43994..., which equals sqrt(6 / (14 + 17)) for out_dim = 14 and
# ff_dim = 17 as declared at the top of the file.
'random': {
'class': 'random',
'shape': (
out_dim,
ff_dim
),
'distribution': 'uniform',
'minval': -0.43994134506405985,
'maxval': 0.43994134506405985,
'dtype': 'float32',
'static': True
},
# Matrix product with 'weight'; out_dim is summed away, ff_dim remains.
'dot': {
'class': 'dot',
'from': ['base:base:layer_norm_2', 'weight'],
'reduce': out_dim,
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
# Adds the bias vector (shape [ff_dim]) to the projection.
'add': {
'class': 'combine',
'from': ['dot', 'bias'],
'kind': 'add',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
# Result of this subnetwork: copy of 'add'.
'output': {
'class': 'copy',
'from': 'add',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
# Projection matrix of shape [out_dim, ff_dim], initialized from 'random'.
'weight': {
'class': 'variable',
'shape': [
out_dim,
ff_dim
],
'param_name': 'param',
'init_by_layer': 'random'
},
# Bias vector of shape [ff_dim], initialized to zero.
'bias': {
'class': 'variable',
'shape': [
ff_dim
],
'param_name': 'param',
'init': 0.0
}
},
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
# Swish activation applied to the output of 'linear_ff'; shape is unchanged.
'swish': {
'class': 'activation',
'from': 'linear_ff',
'activation': 'swish',
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
'dropout_0': {
'class': 'dropout',
'from': 'swish',
'dropout': 0.1,
'dropout_axis': out_dim,
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim}
},
# Second feed-forward projection ff_dim -> out_dim:
# 'dot' contracts ff_dim of 'base:dropout_0' with 'weight', then 'add' applies
# 'bias'.
'linear_out': {
'class': 'subnetwork',
'from': [],
'subnetwork': {
# Initial values for 'weight' (referenced there via init_by_layer): uniform in
# +/-0.43994..., which equals sqrt(6 / (17 + 14)) for ff_dim = 17 and
# out_dim = 14 as declared at the top of the file.
'random': {
'class': 'random',
'shape': (
ff_dim,
out_dim
),
'distribution': 'uniform',
'minval': -0.43994134506405985,
'maxval': 0.43994134506405985,
'dtype': 'float32',
'static': True
},
# Matrix product with 'weight'; ff_dim is summed away, out_dim remains.
'dot': {
'class': 'dot',
'from': ['base:dropout_0', 'weight'],
'reduce': ff_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Adds the bias vector (shape [out_dim]) to the projection.
'add': {
'class': 'combine',
'from': ['dot', 'bias'],
'kind': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Result of this subnetwork: copy of 'add'.
'output': {
'class': 'copy',
'from': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Projection matrix of shape [ff_dim, out_dim], initialized from 'random'.
'weight': {
'class': 'variable',
'shape': [
ff_dim,
out_dim
],
'param_name': 'param',
'init_by_layer': 'random'
},
# Bias vector of shape [out_dim], initialized to zero.
'bias': {
'class': 'variable',
'shape': [
out_dim
],
'param_name': 'param',
'init': 0.0
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Result of this subnetwork: a plain copy of 'linear_out'.
'output': {
'class': 'copy',
'from': 'linear_out',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Dropout with rate 0.1 on the output of 'ffn2', with out_dim given as
# dropout_axis. Shape is unchanged.
'dropout_2': {
'class': 'dropout',
'from': 'ffn2',
'dropout': 0.1,
'dropout_axis': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Scalar constant 0.5; it is the scaling factor consumed by 'mul_0'.
'constant_0': {'class': 'constant', 'value': 0.5},
# Scales the feed-forward branch: element-wise product of 'constant_0' (0.5)
# and 'dropout_2'.
'mul_0': {
'class': 'combine',
'from': ['constant_0', 'dropout_2'],
'kind': 'mul',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Element-wise sum of the half-scaled feed-forward branch 'mul_0' and 'add_1'.
# Because 'add_1' is also the tensor that feeds this branch (through
# 'layer_norm_2'), this acts as the residual connection around it.
'add_2': {
'class': 'combine',
'from': ['mul_0', 'add_1'],
'kind': 'add',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Final layer normalization of 'add_2', with out_dim passed as in_dim
# (presumably the axis that gets normalized - confirm). Shape is unchanged.
'layer_norm_3': {
'class': 'layer_norm',
'from': 'add_2',
'in_dim': out_dim,
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Result of this subnetwork: a plain copy of 'layer_norm_3'.
'output': {
'class': 'copy',
'from': 'layer_norm_3',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Result of the enclosing subnetwork: a plain copy of the layer named '1'.
# NOTE(review): '1' is presumably the subnetwork that closes directly above
# this entry; its opening line is outside this excerpt - confirm.
'output': {
'class': 'copy',
'from': '1',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
}
},
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
},
# Network output: a plain copy of 'layers', with dims
# {batch_dim, out_dim, conv_subsample_layer_out_dim}.
'output': {
'class': 'copy',
'from': 'layers',
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment