# Source: GitHub gist Max-Ryujin/f7323769568c9803bd19e3eba623c2d1
# returnn.config for the setup /u/maximilian.kannen/setups/20230406_feat/recipe/i6_experiments/users/vieting/experiments/switchboard/ctc/feat/experiments.run_scf_baseline_big
# Note from the GitHub viewer: this file contains bidirectional Unicode text that may be interpreted or
# compiled differently than what appears here; review it in an editor that reveals hidden Unicode characters.
#!rnn.py
import sys

# Raise the interpreter recursion limit to 3000.
# NOTE(review): presumably required because constructing the deep `network`
# definition in this config recurses heavily -- confirm before lowering.
sys.setrecursionlimit(3000)
def _mask(x, batch_axis, axis, pos, max_amount):
    """
    Zero out one contiguous span along ``axis`` for every batch entry.

    For each batch entry ``b`` a span length is drawn uniformly from
    ``[1, max_amount]``. All positions ``idx`` along ``axis`` with
    ``pos[b] <= idx < min(pos[b] + length, dim)`` are replaced by ``0.0``,
    i.e. the span is clipped at the end of the axis.

    :param tf.Tensor x: (batch,time,feature)
    :param int batch_axis: index of the batch axis of ``x``
    :param int axis: index of the axis along which the span is masked
    :param tf.Tensor pos: (batch,) start index of the span per batch entry
    :param int|tf.Tensor max_amount: inclusive upper bound of the span length
    :return: ``x`` with the selected spans set to 0.0
    :rtype: tf.Tensor
    """
    import tensorflow as tf

    ndim = x.get_shape().ndims
    n_batch = tf.shape(x)[batch_axis]
    dim = tf.shape(x)[axis]
    # maxval is exclusive for integer dtypes, hence max_amount + 1.
    amount = tf.random.uniform(
        shape=(n_batch,), minval=1, maxval=max_amount + 1, dtype=tf.int32
    )
    # Exclusive end index of the span, clipped to the axis length.
    pos2 = tf.math.minimum(pos + amount, dim)
    idxs = tf.expand_dims(tf.range(0, dim), 0)  # (1,dim)
    pos_bc = tf.expand_dims(pos, 1)  # (batch,1)
    pos2_bc = tf.expand_dims(pos2, 1)  # (batch,1)
    cond = tf.math.logical_and(
        tf.greater_equal(idxs, pos_bc), tf.less(idxs, pos2_bc)
    )  # (batch,dim)
    if batch_axis > axis:
        # Bring the two axes into the order in which they occur in x.
        cond = tf.transpose(cond)  # (dim,batch)
    # Size-1 entries for every other axis so that cond broadcasts against x.
    cond = tf.reshape(
        cond, [tf.shape(x)[i] if i in (batch_axis, axis) else 1 for i in range(ndim)]
    )
    # Prefer the canonical RETURNN module path; fall back to the legacy
    # top-level alias so that older RETURNN checkouts keep working.
    try:
        from returnn.tf.util.basic import where_bc
    except ImportError:
        from TFUtil import where_bc
    x = where_bc(cond, 0.0, x)
    return x
def random_mask(x, batch_axis, axis, min_num, max_num, max_dims):
    """
    Apply a random number of zero-masks (see ``_mask``) along ``axis``.

    The number of masks is ``min_num`` when ``min_num`` and ``max_num`` are
    equal Python ints; otherwise it is drawn per batch entry uniformly from
    ``[min_num, max_num]``. Mask start positions are sampled per batch entry
    via top-k over Gumbel noise, each mask span has a length of at most
    ``max_dims``.

    :param tf.Tensor x: (batch,time,feature)
    :param int batch_axis: index of the batch axis of ``x``
    :param int axis: index of the axis along which to mask
    :param int|tf.Tensor min_num: minimum number of masks
    :param int|tf.Tensor max_num: inclusive maximum number of masks
    :param int|tf.Tensor max_dims: inclusive maximum length of one mask
    :return: masked ``x``
    :rtype: tf.Tensor
    """
    import tensorflow as tf

    n_batch = tf.shape(x)[batch_axis]
    if isinstance(min_num, int) and isinstance(max_num, int) and min_num == max_num:
        num = min_num
    else:
        # maxval is exclusive for integer dtypes, hence max_num + 1.
        num = tf.random.uniform(
            shape=(n_batch,), minval=min_num, maxval=max_num + 1, dtype=tf.int32
        )
    # Gumbel-max trick to draw distinct start positions per batch entry:
    # https://github.com/tensorflow/tensorflow/issues/9260
    # https://timvieira.github.io/blog/post/2014/08/01/gumbel-max-trick-and-weighted-reservoir-sampling/
    z = -tf.math.log(
        -tf.math.log(tf.random.uniform((n_batch, tf.shape(x)[axis]), 0, 1))
    )
    _, indices = tf.math.top_k(z, num if isinstance(num, int) else tf.reduce_max(num))
    # indices is of shape (batch,k) with int32 entries in [0,dim), where k is
    # num (static case) or the maximum of num over the batch (dynamic case).

    if isinstance(num, int):
        # Static number of masks: unroll in Python.
        for i in range(num):
            x = _mask(
                x,
                batch_axis=batch_axis,
                axis=axis,
                pos=indices[:, i],
                max_amount=max_dims,
            )
        return x

    # Dynamic number of masks: iterate up to the largest count in the batch
    # and only keep the i-th mask for batch entries with i < num.
    # The per-batch selector is shaped to size n_batch on the batch axis and
    # 1 elsewhere, so it works for any batch_axis / rank (for batch-major 3-D
    # input this is (batch,1,1)).
    ndim = x.get_shape().ndims
    select_shape = [n_batch if i == batch_axis else 1 for i in range(ndim)]

    def _loop_body(i, cur):
        masked = _mask(
            cur,
            batch_axis=batch_axis,
            axis=axis,
            pos=indices[:, i],
            max_amount=max_dims,
        )
        use_mask = tf.reshape(tf.less(i, num), select_shape)
        # NOTE: relies on the broadcasting behaviour of tf.where (TF2 semantics).
        return i + 1, tf.where(use_mask, masked, cur)

    _, x = tf.while_loop(
        cond=lambda i, _: tf.less(i, tf.reduce_max(num)),
        body=_loop_body,
        loop_vars=(0, x),
    )
    return x
def transform(data, max_time_num, max_time, max_feature_num, max_feature, network):
    """
    Random time and feature masking of ``data``; used by the ``specaug`` eval
    layer of ``network`` in this config.

    Masks are applied through ``network.cond_on_train``: the masked tensor is
    the first branch, the unmodified input the second one. While the global
    train step is below ``conservative_step`` (2000) the maximum number of
    masks on both axes is halved (integer division by 2).

    :param data: object providing ``placeholder``, ``batch_dim_axis``,
        ``time_dim_axis`` and ``feature_dim_axis`` (the eval layer passes
        ``source(0, as_data=True)``)
    :param int max_time_num: lower bound for the maximum number of time masks
    :param int max_time: inclusive maximum length of one time mask
    :param int max_feature_num: maximum number of feature masks
    :param int max_feature: inclusive maximum length of one feature mask
    :param network: object providing ``global_train_step`` and ``cond_on_train``
    :return: result of ``network.cond_on_train``
    """
    import tensorflow as tf

    # halved before this step
    conservative_step = 2000
    x = data.placeholder
    step = network.global_train_step
    # 1 while step < conservative_step, 0 afterwards; the mask counts below are
    # divided by (1 + increase_flag), i.e. by 2 early on and by 1 later.
    increase_flag = tf.where(tf.greater_equal(step, conservative_step), 0, 1)

    def get_masked():
        # Time masking: the maximum mask count grows with the sequence length,
        # max(time_len // int(1.0 / 0.7 * max_time), max_time_num).
        time_len = tf.shape(x)[data.time_dim_axis]
        x_masked = random_mask(
            x,
            batch_axis=data.batch_dim_axis,
            axis=data.time_dim_axis,
            min_num=0,
            max_num=tf.maximum(time_len // int(1.0 / 0.7 * max_time), max_time_num)
            // (1 + increase_flag),
            max_dims=max_time,
        )
        # Feature masking on top of the time-masked tensor.
        x_masked = random_mask(
            x_masked,
            batch_axis=data.batch_dim_axis,
            axis=data.feature_dim_axis,
            min_num=0,
            max_num=max_feature_num // (1 + increase_flag),
            max_dims=max_feature,
        )
        return x_masked

    # Return directly instead of rebinding x: the identity branch closes over
    # x and must always see the unmasked input.
    return network.cond_on_train(get_masked, lambda: x)
# --- RETURNN config options (plain module-level assignments) ---

# Batch size limits, given separately for the "classes" and "data" keys.
batch_size = {"classes": 5000, "data": 400000}
cache_size = "0"
cleanup_old_models = {"keep_last_n": 5, "keep_best_n": 5, "keep": [450]}
debug_print_layer_output_template = True

# `dev` dataset: raw audio with peak normalization, no targets.
dev = {
    "class": "OggZipDataset",
    "audio": {"features": "raw", "peak_normalization": True},
    "partition_epoch": 1,
    "path": [
        "/u/maximilian.kannen/setups/20230406_feat/work/i6_core/returnn/oggzip/BlissToOggZipJob.lAFM8R9mzLpI/output/out.ogg.zip"
    ],
    "seq_ordering": "sorted_reverse",
    "use_cache_manager": True,
    "segment_file": "/u/maximilian.kannen/setups/20230406_feat/work/i6_core/corpus/filter/FilterSegmentsByListJob.Fzh6DWEkIA5y/output/segments.1",
    "targets": None,
}
device = "gpu"

# Additional evaluation dataset "devtrain": same ogg.zip archive and options
# as `dev`, only the segment file differs.
eval_datasets = {
    "devtrain": {
        "class": "OggZipDataset",
        "audio": {"features": "raw", "peak_normalization": True},
        "partition_epoch": 1,
        "path": [
            "/u/maximilian.kannen/setups/20230406_feat/work/i6_core/returnn/oggzip/BlissToOggZipJob.lAFM8R9mzLpI/output/out.ogg.zip"
        ],
        "seq_ordering": "sorted_reverse",
        "use_cache_manager": True,
        "segment_file": "/u/maximilian.kannen/setups/20230406_feat/work/i6_core/text/processing/TailJob.RiSM6fe2XipO/output/out.gz",
        "targets": None,
    }
}
extern_data = {"data": {"dim": 1}}
gradient_noise = 0.0

# Learning-rate control settings.
learning_rate_control = "newbob_multi_epoch"
learning_rate_control_min_num_epochs_per_new_lr = 3
learning_rate_control_relative_error_relative_lr = True
learning_rate_file = "learning_rates"
# Explicit learning-rate schedule with 362 entries:
#   * entries 0..180: rise in equal steps from 1.325e-05 to 0.0004,
#   * entries 181..361: fall in equal steps from 0.0004 to 1e-05.
# The peak value 0.0004 therefore appears twice in a row.
learning_rates = [
    1.325e-05,
    1.539861111111111e-05,
    1.754722222222222e-05,
    1.9695833333333335e-05,
    2.1844444444444446e-05,
    2.3993055555555557e-05,
    2.6141666666666667e-05,
    2.8290277777777778e-05,
    3.043888888888889e-05,
    3.25875e-05,
    3.473611111111111e-05,
    3.688472222222222e-05,
    3.903333333333334e-05,
    4.118194444444444e-05,
    4.333055555555556e-05,
    4.547916666666666e-05,
    4.762777777777778e-05,
    4.9776388888888884e-05,
    5.1925e-05,
    5.4073611111111106e-05,
    5.622222222222222e-05,
    5.837083333333333e-05,
    6.0519444444444444e-05,
    6.266805555555555e-05,
    6.481666666666667e-05,
    6.696527777777778e-05,
    6.911388888888889e-05,
    7.12625e-05,
    7.341111111111111e-05,
    7.555972222222221e-05,
    7.770833333333333e-05,
    7.985694444444443e-05,
    8.200555555555555e-05,
    8.415416666666667e-05,
    8.630277777777777e-05,
    8.845138888888889e-05,
    9.059999999999999e-05,
    9.274861111111111e-05,
    9.489722222222221e-05,
    9.704583333333333e-05,
    9.919444444444444e-05,
    0.00010134305555555555,
    0.00010349166666666666,
    0.00010564027777777777,
    0.00010778888888888888,
    0.0001099375,
    0.0001120861111111111,
    0.00011423472222222222,
    0.00011638333333333333,
    0.00011853194444444444,
    0.00012068055555555556,
    0.00012282916666666666,
    0.00012497777777777778,
    0.0001271263888888889,
    0.000129275,
    0.0001314236111111111,
    0.00013357222222222222,
    0.00013572083333333334,
    0.00013786944444444443,
    0.00014001805555555554,
    0.00014216666666666666,
    0.00014431527777777778,
    0.00014646388888888887,
    0.0001486125,
    0.0001507611111111111,
    0.00015290972222222222,
    0.00015505833333333334,
    0.00015720694444444443,
    0.00015935555555555555,
    0.00016150416666666666,
    0.00016365277777777778,
    0.00016580138888888887,
    0.00016795,
    0.0001700986111111111,
    0.00017224722222222222,
    0.00017439583333333331,
    0.00017654444444444443,
    0.00017869305555555555,
    0.00018084166666666667,
    0.00018299027777777776,
    0.00018513888888888887,
    0.0001872875,
    0.0001894361111111111,
    0.00019158472222222223,
    0.00019373333333333332,
    0.00019588194444444443,
    0.00019803055555555555,
    0.00020017916666666667,
    0.00020232777777777776,
    0.00020447638888888888,
    0.000206625,
    0.0002087736111111111,
    0.0002109222222222222,
    0.00021307083333333332,
    0.00021521944444444444,
    0.00021736805555555555,
    0.00021951666666666667,
    0.00022166527777777776,
    0.00022381388888888888,
    0.0002259625,
    0.00022811111111111111,
    0.0002302597222222222,
    0.00023240833333333332,
    0.00023455694444444444,
    0.00023670555555555556,
    0.00023885416666666665,
    0.00024100277777777776,
    0.00024315138888888888,
    0.0002453,
    0.0002474486111111111,
    0.00024959722222222223,
    0.0002517458333333333,
    0.00025389444444444447,
    0.00025604305555555556,
    0.0002581916666666667,
    0.0002603402777777778,
    0.0002624888888888889,
    0.00026463750000000003,
    0.0002667861111111111,
    0.00026893472222222226,
    0.00027108333333333335,
    0.00027323194444444444,
    0.0002753805555555556,
    0.0002775291666666667,
    0.00027967777777777777,
    0.0002818263888888889,
    0.000283975,
    0.00028612361111111115,
    0.00028827222222222224,
    0.00029042083333333333,
    0.0002925694444444445,
    0.00029471805555555556,
    0.0002968666666666667,
    0.0002990152777777778,
    0.0003011638888888889,
    0.00030331250000000003,
    0.0003054611111111111,
    0.0003076097222222222,
    0.00030975833333333336,
    0.00031190694444444445,
    0.0003140555555555556,
    0.0003162041666666667,
    0.0003183527777777778,
    0.0003205013888888889,
    0.00032265,
    0.00032479861111111115,
    0.00032694722222222224,
    0.00032909583333333333,
    0.0003312444444444445,
    0.00033339305555555557,
    0.00033554166666666666,
    0.0003376902777777778,
    0.0003398388888888889,
    0.00034198750000000004,
    0.00034413611111111113,
    0.0003462847222222222,
    0.00034843333333333336,
    0.00035058194444444445,
    0.00035273055555555554,
    0.0003548791666666667,
    0.0003570277777777778,
    0.0003591763888888889,
    0.000361325,
    0.0003634736111111111,
    0.00036562222222222225,
    0.00036777083333333334,
    0.0003699194444444445,
    0.0003720680555555556,
    0.00037421666666666666,
    0.0003763652777777778,
    0.0003785138888888889,
    0.0003806625,
    0.00038281111111111113,
    0.0003849597222222222,
    0.00038710833333333337,
    0.00038925694444444446,
    0.00039140555555555555,
    0.0003935541666666667,
    0.0003957027777777778,
    0.00039785138888888893,
    0.0004,
    0.0004,
    0.00039783333333333337,
    0.00039566666666666667,
    0.0003935,
    0.0003913333333333334,
    0.0003891666666666667,
    0.00038700000000000003,
    0.0003848333333333334,
    0.0003826666666666667,
    0.00038050000000000003,
    0.00037833333333333333,
    0.0003761666666666667,
    0.00037400000000000004,
    0.00037183333333333334,
    0.0003696666666666667,
    0.00036750000000000004,
    0.00036533333333333334,
    0.0003631666666666667,
    0.000361,
    0.00035883333333333335,
    0.0003566666666666667,
    0.0003545,
    0.00035233333333333335,
    0.0003501666666666667,
    0.000348,
    0.00034583333333333335,
    0.0003436666666666667,
    0.0003415,
    0.00033933333333333336,
    0.0003371666666666667,
    0.000335,
    0.00033283333333333336,
    0.0003306666666666667,
    0.0003285,
    0.00032633333333333337,
    0.0003241666666666667,
    0.000322,
    0.0003198333333333334,
    0.00031766666666666667,
    0.0003155,
    0.0003133333333333333,
    0.0003111666666666667,
    0.00030900000000000003,
    0.00030683333333333333,
    0.0003046666666666667,
    0.00030250000000000003,
    0.00030033333333333333,
    0.0002981666666666667,
    0.00029600000000000004,
    0.00029383333333333334,
    0.0002916666666666667,
    0.00028950000000000004,
    0.00028733333333333334,
    0.0002851666666666667,
    0.00028300000000000005,
    0.00028083333333333335,
    0.0002786666666666667,
    0.00027650000000000005,
    0.00027433333333333335,
    0.0002721666666666667,
    0.00027000000000000006,
    0.00026783333333333336,
    0.00026566666666666666,
    0.00026350000000000006,
    0.00026133333333333336,
    0.00025916666666666666,
    0.00025700000000000007,
    0.00025483333333333337,
    0.00025266666666666666,
    0.0002505,
    0.00024833333333333337,
    0.00024616666666666667,
    0.00024400000000000002,
    0.00024183333333333337,
    0.0002396666666666667,
    0.00023750000000000003,
    0.00023533333333333335,
    0.0002331666666666667,
    0.00023100000000000003,
    0.00022883333333333336,
    0.00022666666666666668,
    0.00022450000000000004,
    0.00022233333333333336,
    0.0002201666666666667,
    0.00021800000000000004,
    0.00021583333333333337,
    0.0002136666666666667,
    0.00021150000000000002,
    0.00020933333333333337,
    0.0002071666666666667,
    0.00020500000000000002,
    0.00020283333333333338,
    0.0002006666666666667,
    0.00019850000000000003,
    0.00019633333333333335,
    0.0001941666666666667,
    0.00019200000000000003,
    0.00018983333333333336,
    0.0001876666666666667,
    0.00018550000000000004,
    0.00018333333333333336,
    0.0001811666666666667,
    0.00017900000000000004,
    0.00017683333333333337,
    0.0001746666666666667,
    0.00017250000000000005,
    0.00017033333333333337,
    0.0001681666666666667,
    0.00016600000000000002,
    0.00016383333333333338,
    0.0001616666666666667,
    0.00015950000000000003,
    0.00015733333333333338,
    0.00015516666666666668,
    0.00015300000000000003,
    0.00015083333333333339,
    0.00014866666666666668,
    0.00014650000000000004,
    0.0001443333333333334,
    0.0001421666666666667,
    0.00014000000000000004,
    0.0001378333333333334,
    0.0001356666666666667,
    0.00013350000000000005,
    0.00013133333333333335,
    0.0001291666666666667,
    0.00012700000000000005,
    0.00012483333333333335,
    0.0001226666666666667,
    0.00012050000000000006,
    0.00011833333333333335,
    0.00011616666666666671,
    0.00011400000000000006,
    0.00011183333333333336,
    0.00010966666666666671,
    0.00010750000000000001,
    0.00010533333333333336,
    0.00010316666666666672,
    0.00010100000000000002,
    9.883333333333337e-05,
    9.666666666666672e-05,
    9.450000000000002e-05,
    9.233333333333337e-05,
    9.016666666666673e-05,
    8.800000000000002e-05,
    8.583333333333338e-05,
    8.366666666666673e-05,
    8.150000000000003e-05,
    7.933333333333338e-05,
    7.716666666666668e-05,
    7.500000000000003e-05,
    7.283333333333339e-05,
    7.066666666666669e-05,
    6.850000000000004e-05,
    6.633333333333339e-05,
    6.416666666666669e-05,
    6.200000000000004e-05,
    5.9833333333333396e-05,
    5.7666666666666695e-05,
    5.550000000000005e-05,
    5.333333333333335e-05,
    5.11666666666667e-05,
    4.900000000000005e-05,
    4.683333333333335e-05,
    4.4666666666666704e-05,
    4.250000000000006e-05,
    4.0333333333333356e-05,
    3.816666666666671e-05,
    3.600000000000006e-05,
    3.383333333333336e-05,
    3.1666666666666714e-05,
    2.9500000000000067e-05,
    2.7333333333333365e-05,
    2.5166666666666718e-05,
    2.3000000000000017e-05,
    2.083333333333337e-05,
    1.8666666666666723e-05,
    1.650000000000002e-05,
    1.4333333333333375e-05,
    1.2166666666666727e-05,
    1e-05,
]
# Logging options.
log = ["./returnn.log"]
log_batch_size = True
log_verbosity = 4
max_seqs = 128
# Same value as the final entry of `learning_rates`.
min_learning_rate = 1e-05
# Path prefix for the model files of this training job.
model = "/u/maximilian.kannen/setups/20230406_feat/work/i6_core/returnn/training/ReturnnTrainingJob.wQIA3Rc1nLak/output/models/epoch"
network = { | |
"specaug": { | |
"class": "eval", | |
"from": "features", | |
"eval": 'self.network.get_config().typed_value("transform")(source(0, as_data=True), max_time_num=1, max_time=15, max_feature_num=5, ' | |
"max_feature=8, network=self.network)", | |
}, | |
"conv_source": { | |
"class": "split_dims", | |
"from": ["specaug"], | |
"axis": "F", | |
"dims": (-1, 1), | |
}, | |
"conv_1": { | |
"class": "conv", | |
"from": "conv_source", | |
"n_out": 32, | |
"filter_size": (3, 3), | |
"padding": "same", | |
"with_bias": True, | |
"activation": "swish", | |
"L2": 0.01, | |
}, | |
"conv_1_pool": { | |
"class": "pool", | |
"mode": "max", | |
"padding": "same", | |
"pool_size": (1, 2), | |
"from": "conv_1", | |
"trainable": False, | |
}, | |
"conv_merged": {"class": "merge_dims", "from": "conv_3", "axes": "static"}, | |
"conv_2": { | |
"class": "conv", | |
"from": "conv_1_pool", | |
"n_out": 64, | |
"filter_size": (3, 3), | |
"padding": "same", | |
"with_bias": True, | |
"activation": "swish", | |
"L2": 0.01, | |
"strides": (2, 1), | |
}, | |
"conv_3": { | |
"class": "conv", | |
"from": "conv_2", | |
"n_out": 64, | |
"filter_size": (3, 3), | |
"padding": "same", | |
"with_bias": True, | |
"activation": "swish", | |
"L2": 0.01, | |
"strides": (2, 1), | |
}, | |
"input_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conv_merged", | |
"activation": None, | |
"L2": 0.0001, | |
"with_bias": False, | |
}, | |
"input_dropout": {"class": "copy", "from": "input_linear", "dropout": 0.1}, | |
"conformer_1_ffmod_1_ln": {"class": "layer_norm", "from": "input_dropout"}, | |
"conformer_1_ffmod_1_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_1_ffmod_1_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_1_ffmod_1_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_1_ffmod_1_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_1_ffmod_1_dropout": { | |
"class": "copy", | |
"from": "conformer_1_ffmod_1_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_1_ffmod_1_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_1_ffmod_1_dropout", "input_dropout"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_1_conv_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_1_ffmod_1_half_res_add", | |
}, | |
"conformer_1_conv_mod_pointwise_conv_1": { | |
"class": "linear", | |
"n_out": 1024, | |
"from": "conformer_1_conv_mod_ln", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_1_conv_mod_glu": { | |
"class": "gating", | |
"from": "conformer_1_conv_mod_pointwise_conv_1", | |
"activation": None, | |
"gate_activation": "sigmoid", | |
}, | |
"conformer_1_conv_mod_depthwise_conv": { | |
"class": "conv", | |
"from": "conformer_1_conv_mod_glu", | |
"n_out": 512, | |
"filter_size": (32,), | |
"padding": "same", | |
"with_bias": True, | |
"activation": None, | |
"L2": 0.0001, | |
"groups": 512, | |
}, | |
"conformer_1_conv_mod_bn": { | |
"class": "batch_norm", | |
"from": "conformer_1_conv_mod_depthwise_conv", | |
"momentum": 0.1, | |
"epsilon": 1e-05, | |
"update_sample_only_in_training": True, | |
"delay_sample_update": True, | |
}, | |
"conformer_1_conv_mod_swish": { | |
"class": "activation", | |
"from": "conformer_1_conv_mod_bn", | |
"activation": "swish", | |
}, | |
"conformer_1_conv_mod_pointwise_conv_2": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_1_conv_mod_swish", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_1_conv_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_1_conv_mod_pointwise_conv_2", | |
"dropout": 0.1, | |
}, | |
"conformer_1_conv_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_1_conv_mod_dropout", "conformer_1_ffmod_1_half_res_add"], | |
"kind": "add", | |
}, | |
"conformer_1_mhsa_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_1_conv_mod_res_add", | |
}, | |
"conformer_1_mhsa_mod_relpos_encoding": { | |
"class": "relative_positional_encoding", | |
"from": "conformer_1_mhsa_mod_ln", | |
"n_out": 64, | |
"clipping": 32, | |
}, | |
"conformer_1_mhsa_mod_self_attention": { | |
"class": "self_attention", | |
"from": "conformer_1_mhsa_mod_ln", | |
"n_out": 512, | |
"num_heads": 8, | |
"total_key_dim": 512, | |
"key_shift": "conformer_1_mhsa_mod_relpos_encoding", | |
"attention_dropout": 0.1, | |
}, | |
"conformer_1_mhsa_mod_att_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_1_mhsa_mod_self_attention", | |
"activation": None, | |
"L2": 0.0001, | |
"with_bias": False, | |
}, | |
"conformer_1_mhsa_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_1_mhsa_mod_att_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_1_mhsa_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_1_mhsa_mod_dropout", "conformer_1_conv_mod_res_add"], | |
"kind": "add", | |
}, | |
"conformer_1_ffmod_2_ln": { | |
"class": "layer_norm", | |
"from": "conformer_1_mhsa_mod_res_add", | |
}, | |
"conformer_1_ffmod_2_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_1_ffmod_2_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_1_ffmod_2_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_1_ffmod_2_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_1_ffmod_2_dropout": { | |
"class": "copy", | |
"from": "conformer_1_ffmod_2_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_1_ffmod_2_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_1_ffmod_2_dropout", "conformer_1_mhsa_mod_res_add"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_1_output": { | |
"class": "layer_norm", | |
"from": "conformer_1_ffmod_2_half_res_add", | |
}, | |
"conformer_2_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_1_output"}, | |
"conformer_2_ffmod_1_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_2_ffmod_1_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_2_ffmod_1_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_2_ffmod_1_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_2_ffmod_1_dropout": { | |
"class": "copy", | |
"from": "conformer_2_ffmod_1_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_2_ffmod_1_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_2_ffmod_1_dropout", "conformer_1_output"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_2_conv_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_2_ffmod_1_half_res_add", | |
}, | |
"conformer_2_conv_mod_pointwise_conv_1": { | |
"class": "linear", | |
"n_out": 1024, | |
"from": "conformer_2_conv_mod_ln", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_2_conv_mod_glu": { | |
"class": "gating", | |
"from": "conformer_2_conv_mod_pointwise_conv_1", | |
"activation": None, | |
"gate_activation": "sigmoid", | |
}, | |
"conformer_2_conv_mod_depthwise_conv": { | |
"class": "conv", | |
"from": "conformer_2_conv_mod_glu", | |
"n_out": 512, | |
"filter_size": (32,), | |
"padding": "same", | |
"with_bias": True, | |
"activation": None, | |
"L2": 0.0001, | |
"groups": 512, | |
}, | |
"conformer_2_conv_mod_bn": { | |
"class": "batch_norm", | |
"from": "conformer_2_conv_mod_depthwise_conv", | |
"momentum": 0.1, | |
"epsilon": 1e-05, | |
"update_sample_only_in_training": True, | |
"delay_sample_update": True, | |
}, | |
"conformer_2_conv_mod_swish": { | |
"class": "activation", | |
"from": "conformer_2_conv_mod_bn", | |
"activation": "swish", | |
}, | |
"conformer_2_conv_mod_pointwise_conv_2": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_2_conv_mod_swish", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_2_conv_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_2_conv_mod_pointwise_conv_2", | |
"dropout": 0.1, | |
}, | |
"conformer_2_conv_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_2_conv_mod_dropout", "conformer_2_ffmod_1_half_res_add"], | |
"kind": "add", | |
}, | |
"conformer_2_mhsa_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_2_conv_mod_res_add", | |
}, | |
"conformer_2_mhsa_mod_relpos_encoding": { | |
"class": "relative_positional_encoding", | |
"from": "conformer_2_mhsa_mod_ln", | |
"n_out": 64, | |
"clipping": 32, | |
}, | |
"conformer_2_mhsa_mod_self_attention": { | |
"class": "self_attention", | |
"from": "conformer_2_mhsa_mod_ln", | |
"n_out": 512, | |
"num_heads": 8, | |
"total_key_dim": 512, | |
"key_shift": "conformer_2_mhsa_mod_relpos_encoding", | |
"attention_dropout": 0.1, | |
}, | |
"conformer_2_mhsa_mod_att_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_2_mhsa_mod_self_attention", | |
"activation": None, | |
"L2": 0.0001, | |
"with_bias": False, | |
}, | |
"conformer_2_mhsa_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_2_mhsa_mod_att_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_2_mhsa_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_2_mhsa_mod_dropout", "conformer_2_conv_mod_res_add"], | |
"kind": "add", | |
}, | |
"conformer_2_ffmod_2_ln": { | |
"class": "layer_norm", | |
"from": "conformer_2_mhsa_mod_res_add", | |
}, | |
"conformer_2_ffmod_2_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_2_ffmod_2_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_2_ffmod_2_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_2_ffmod_2_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_2_ffmod_2_dropout": { | |
"class": "copy", | |
"from": "conformer_2_ffmod_2_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_2_ffmod_2_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_2_ffmod_2_dropout", "conformer_2_mhsa_mod_res_add"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_2_output": { | |
"class": "layer_norm", | |
"from": "conformer_2_ffmod_2_half_res_add", | |
}, | |
"conformer_3_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_2_output"}, | |
"conformer_3_ffmod_1_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_3_ffmod_1_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_3_ffmod_1_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_3_ffmod_1_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_3_ffmod_1_dropout": { | |
"class": "copy", | |
"from": "conformer_3_ffmod_1_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_3_ffmod_1_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_3_ffmod_1_dropout", "conformer_2_output"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_3_conv_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_3_ffmod_1_half_res_add", | |
}, | |
"conformer_3_conv_mod_pointwise_conv_1": { | |
"class": "linear", | |
"n_out": 1024, | |
"from": "conformer_3_conv_mod_ln", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_3_conv_mod_glu": { | |
"class": "gating", | |
"from": "conformer_3_conv_mod_pointwise_conv_1", | |
"activation": None, | |
"gate_activation": "sigmoid", | |
}, | |
"conformer_3_conv_mod_depthwise_conv": { | |
"class": "conv", | |
"from": "conformer_3_conv_mod_glu", | |
"n_out": 512, | |
"filter_size": (32,), | |
"padding": "same", | |
"with_bias": True, | |
"activation": None, | |
"L2": 0.0001, | |
"groups": 512, | |
}, | |
"conformer_3_conv_mod_bn": { | |
"class": "batch_norm", | |
"from": "conformer_3_conv_mod_depthwise_conv", | |
"momentum": 0.1, | |
"epsilon": 1e-05, | |
"update_sample_only_in_training": True, | |
"delay_sample_update": True, | |
}, | |
"conformer_3_conv_mod_swish": { | |
"class": "activation", | |
"from": "conformer_3_conv_mod_bn", | |
"activation": "swish", | |
}, | |
"conformer_3_conv_mod_pointwise_conv_2": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_3_conv_mod_swish", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_3_conv_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_3_conv_mod_pointwise_conv_2", | |
"dropout": 0.1, | |
}, | |
"conformer_3_conv_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_3_conv_mod_dropout", "conformer_3_ffmod_1_half_res_add"], | |
"kind": "add", | |
}, | |
"conformer_3_mhsa_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_3_conv_mod_res_add", | |
}, | |
"conformer_3_mhsa_mod_relpos_encoding": { | |
"class": "relative_positional_encoding", | |
"from": "conformer_3_mhsa_mod_ln", | |
"n_out": 64, | |
"clipping": 32, | |
}, | |
"conformer_3_mhsa_mod_self_attention": { | |
"class": "self_attention", | |
"from": "conformer_3_mhsa_mod_ln", | |
"n_out": 512, | |
"num_heads": 8, | |
"total_key_dim": 512, | |
"key_shift": "conformer_3_mhsa_mod_relpos_encoding", | |
"attention_dropout": 0.1, | |
}, | |
"conformer_3_mhsa_mod_att_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_3_mhsa_mod_self_attention", | |
"activation": None, | |
"L2": 0.0001, | |
"with_bias": False, | |
}, | |
"conformer_3_mhsa_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_3_mhsa_mod_att_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_3_mhsa_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_3_mhsa_mod_dropout", "conformer_3_conv_mod_res_add"], | |
"kind": "add", | |
}, | |
"conformer_3_ffmod_2_ln": { | |
"class": "layer_norm", | |
"from": "conformer_3_mhsa_mod_res_add", | |
}, | |
"conformer_3_ffmod_2_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_3_ffmod_2_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_3_ffmod_2_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_3_ffmod_2_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_3_ffmod_2_dropout": { | |
"class": "copy", | |
"from": "conformer_3_ffmod_2_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_3_ffmod_2_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_3_ffmod_2_dropout", "conformer_3_mhsa_mod_res_add"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_3_output": { | |
"class": "layer_norm", | |
"from": "conformer_3_ffmod_2_half_res_add", | |
}, | |
"conformer_4_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_3_output"}, | |
"conformer_4_ffmod_1_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_4_ffmod_1_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_4_ffmod_1_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_4_ffmod_1_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_4_ffmod_1_dropout": { | |
"class": "copy", | |
"from": "conformer_4_ffmod_1_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_4_ffmod_1_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_4_ffmod_1_dropout", "conformer_3_output"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_4_conv_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_4_ffmod_1_half_res_add", | |
}, | |
"conformer_4_conv_mod_pointwise_conv_1": { | |
"class": "linear", | |
"n_out": 1024, | |
"from": "conformer_4_conv_mod_ln", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_4_conv_mod_glu": { | |
"class": "gating", | |
"from": "conformer_4_conv_mod_pointwise_conv_1", | |
"activation": None, | |
"gate_activation": "sigmoid", | |
}, | |
"conformer_4_conv_mod_depthwise_conv": { | |
"class": "conv", | |
"from": "conformer_4_conv_mod_glu", | |
"n_out": 512, | |
"filter_size": (32,), | |
"padding": "same", | |
"with_bias": True, | |
"activation": None, | |
"L2": 0.0001, | |
"groups": 512, | |
}, | |
"conformer_4_conv_mod_bn": { | |
"class": "batch_norm", | |
"from": "conformer_4_conv_mod_depthwise_conv", | |
"momentum": 0.1, | |
"epsilon": 1e-05, | |
"update_sample_only_in_training": True, | |
"delay_sample_update": True, | |
}, | |
"conformer_4_conv_mod_swish": { | |
"class": "activation", | |
"from": "conformer_4_conv_mod_bn", | |
"activation": "swish", | |
}, | |
"conformer_4_conv_mod_pointwise_conv_2": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_4_conv_mod_swish", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_4_conv_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_4_conv_mod_pointwise_conv_2", | |
"dropout": 0.1, | |
}, | |
"conformer_4_conv_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_4_conv_mod_dropout", "conformer_4_ffmod_1_half_res_add"], | |
"kind": "add", | |
}, | |
"conformer_4_mhsa_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_4_conv_mod_res_add", | |
}, | |
"conformer_4_mhsa_mod_relpos_encoding": { | |
"class": "relative_positional_encoding", | |
"from": "conformer_4_mhsa_mod_ln", | |
"n_out": 64, | |
"clipping": 32, | |
}, | |
"conformer_4_mhsa_mod_self_attention": { | |
"class": "self_attention", | |
"from": "conformer_4_mhsa_mod_ln", | |
"n_out": 512, | |
"num_heads": 8, | |
"total_key_dim": 512, | |
"key_shift": "conformer_4_mhsa_mod_relpos_encoding", | |
"attention_dropout": 0.1, | |
}, | |
"conformer_4_mhsa_mod_att_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_4_mhsa_mod_self_attention", | |
"activation": None, | |
"L2": 0.0001, | |
"with_bias": False, | |
}, | |
"conformer_4_mhsa_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_4_mhsa_mod_att_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_4_mhsa_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_4_mhsa_mod_dropout", "conformer_4_conv_mod_res_add"], | |
"kind": "add", | |
}, | |
"conformer_4_ffmod_2_ln": { | |
"class": "layer_norm", | |
"from": "conformer_4_mhsa_mod_res_add", | |
}, | |
"conformer_4_ffmod_2_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_4_ffmod_2_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_4_ffmod_2_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_4_ffmod_2_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_4_ffmod_2_dropout": { | |
"class": "copy", | |
"from": "conformer_4_ffmod_2_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_4_ffmod_2_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_4_ffmod_2_dropout", "conformer_4_mhsa_mod_res_add"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_4_output": { | |
"class": "layer_norm", | |
"from": "conformer_4_ffmod_2_half_res_add", | |
}, | |
"conformer_5_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_4_output"}, | |
"conformer_5_ffmod_1_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_5_ffmod_1_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_5_ffmod_1_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_5_ffmod_1_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_5_ffmod_1_dropout": { | |
"class": "copy", | |
"from": "conformer_5_ffmod_1_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_5_ffmod_1_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_5_ffmod_1_dropout", "conformer_4_output"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_5_conv_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_5_ffmod_1_half_res_add", | |
}, | |
"conformer_5_conv_mod_pointwise_conv_1": { | |
"class": "linear", | |
"n_out": 1024, | |
"from": "conformer_5_conv_mod_ln", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_5_conv_mod_glu": { | |
"class": "gating", | |
"from": "conformer_5_conv_mod_pointwise_conv_1", | |
"activation": None, | |
"gate_activation": "sigmoid", | |
}, | |
"conformer_5_conv_mod_depthwise_conv": { | |
"class": "conv", | |
"from": "conformer_5_conv_mod_glu", | |
"n_out": 512, | |
"filter_size": (32,), | |
"padding": "same", | |
"with_bias": True, | |
"activation": None, | |
"L2": 0.0001, | |
"groups": 512, | |
}, | |
"conformer_5_conv_mod_bn": { | |
"class": "batch_norm", | |
"from": "conformer_5_conv_mod_depthwise_conv", | |
"momentum": 0.1, | |
"epsilon": 1e-05, | |
"update_sample_only_in_training": True, | |
"delay_sample_update": True, | |
}, | |
"conformer_5_conv_mod_swish": { | |
"class": "activation", | |
"from": "conformer_5_conv_mod_bn", | |
"activation": "swish", | |
}, | |
"conformer_5_conv_mod_pointwise_conv_2": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_5_conv_mod_swish", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_5_conv_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_5_conv_mod_pointwise_conv_2", | |
"dropout": 0.1, | |
}, | |
"conformer_5_conv_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_5_conv_mod_dropout", "conformer_5_ffmod_1_half_res_add"], | |
"kind": "add", | |
}, | |
"conformer_5_mhsa_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_5_conv_mod_res_add", | |
}, | |
"conformer_5_mhsa_mod_relpos_encoding": { | |
"class": "relative_positional_encoding", | |
"from": "conformer_5_mhsa_mod_ln", | |
"n_out": 64, | |
"clipping": 32, | |
}, | |
"conformer_5_mhsa_mod_self_attention": { | |
"class": "self_attention", | |
"from": "conformer_5_mhsa_mod_ln", | |
"n_out": 512, | |
"num_heads": 8, | |
"total_key_dim": 512, | |
"key_shift": "conformer_5_mhsa_mod_relpos_encoding", | |
"attention_dropout": 0.1, | |
}, | |
"conformer_5_mhsa_mod_att_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_5_mhsa_mod_self_attention", | |
"activation": None, | |
"L2": 0.0001, | |
"with_bias": False, | |
}, | |
"conformer_5_mhsa_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_5_mhsa_mod_att_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_5_mhsa_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_5_mhsa_mod_dropout", "conformer_5_conv_mod_res_add"], | |
"kind": "add", | |
}, | |
"conformer_5_ffmod_2_ln": { | |
"class": "layer_norm", | |
"from": "conformer_5_mhsa_mod_res_add", | |
}, | |
"conformer_5_ffmod_2_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_5_ffmod_2_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_5_ffmod_2_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_5_ffmod_2_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_5_ffmod_2_dropout": { | |
"class": "copy", | |
"from": "conformer_5_ffmod_2_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_5_ffmod_2_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_5_ffmod_2_dropout", "conformer_5_mhsa_mod_res_add"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_5_output": { | |
"class": "layer_norm", | |
"from": "conformer_5_ffmod_2_half_res_add", | |
}, | |
"conformer_6_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_5_output"}, | |
"conformer_6_ffmod_1_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_6_ffmod_1_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_6_ffmod_1_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_6_ffmod_1_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_6_ffmod_1_dropout": { | |
"class": "copy", | |
"from": "conformer_6_ffmod_1_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_6_ffmod_1_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_6_ffmod_1_dropout", "conformer_5_output"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_6_conv_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_6_ffmod_1_half_res_add", | |
}, | |
"conformer_6_conv_mod_pointwise_conv_1": { | |
"class": "linear", | |
"n_out": 1024, | |
"from": "conformer_6_conv_mod_ln", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_6_conv_mod_glu": { | |
"class": "gating", | |
"from": "conformer_6_conv_mod_pointwise_conv_1", | |
"activation": None, | |
"gate_activation": "sigmoid", | |
}, | |
"conformer_6_conv_mod_depthwise_conv": { | |
"class": "conv", | |
"from": "conformer_6_conv_mod_glu", | |
"n_out": 512, | |
"filter_size": (32,), | |
"padding": "same", | |
"with_bias": True, | |
"activation": None, | |
"L2": 0.0001, | |
"groups": 512, | |
}, | |
"conformer_6_conv_mod_bn": { | |
"class": "batch_norm", | |
"from": "conformer_6_conv_mod_depthwise_conv", | |
"momentum": 0.1, | |
"epsilon": 1e-05, | |
"update_sample_only_in_training": True, | |
"delay_sample_update": True, | |
}, | |
"conformer_6_conv_mod_swish": { | |
"class": "activation", | |
"from": "conformer_6_conv_mod_bn", | |
"activation": "swish", | |
}, | |
"conformer_6_conv_mod_pointwise_conv_2": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_6_conv_mod_swish", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_6_conv_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_6_conv_mod_pointwise_conv_2", | |
"dropout": 0.1, | |
}, | |
"conformer_6_conv_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_6_conv_mod_dropout", "conformer_6_ffmod_1_half_res_add"], | |
"kind": "add", | |
}, | |
"conformer_6_mhsa_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_6_conv_mod_res_add", | |
}, | |
"conformer_6_mhsa_mod_relpos_encoding": { | |
"class": "relative_positional_encoding", | |
"from": "conformer_6_mhsa_mod_ln", | |
"n_out": 64, | |
"clipping": 32, | |
}, | |
"conformer_6_mhsa_mod_self_attention": { | |
"class": "self_attention", | |
"from": "conformer_6_mhsa_mod_ln", | |
"n_out": 512, | |
"num_heads": 8, | |
"total_key_dim": 512, | |
"key_shift": "conformer_6_mhsa_mod_relpos_encoding", | |
"attention_dropout": 0.1, | |
}, | |
"conformer_6_mhsa_mod_att_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_6_mhsa_mod_self_attention", | |
"activation": None, | |
"L2": 0.0001, | |
"with_bias": False, | |
}, | |
"conformer_6_mhsa_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_6_mhsa_mod_att_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_6_mhsa_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_6_mhsa_mod_dropout", "conformer_6_conv_mod_res_add"], | |
"kind": "add", | |
}, | |
"conformer_6_ffmod_2_ln": { | |
"class": "layer_norm", | |
"from": "conformer_6_mhsa_mod_res_add", | |
}, | |
"conformer_6_ffmod_2_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_6_ffmod_2_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_6_ffmod_2_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_6_ffmod_2_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_6_ffmod_2_dropout": { | |
"class": "copy", | |
"from": "conformer_6_ffmod_2_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_6_ffmod_2_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_6_ffmod_2_dropout", "conformer_6_mhsa_mod_res_add"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_6_output": { | |
"class": "layer_norm", | |
"from": "conformer_6_ffmod_2_half_res_add", | |
}, | |
"conformer_7_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_6_output"}, | |
"conformer_7_ffmod_1_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_7_ffmod_1_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_7_ffmod_1_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_7_ffmod_1_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_7_ffmod_1_dropout": { | |
"class": "copy", | |
"from": "conformer_7_ffmod_1_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_7_ffmod_1_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_7_ffmod_1_dropout", "conformer_6_output"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_7_conv_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_7_ffmod_1_half_res_add", | |
}, | |
"conformer_7_conv_mod_pointwise_conv_1": { | |
"class": "linear", | |
"n_out": 1024, | |
"from": "conformer_7_conv_mod_ln", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_7_conv_mod_glu": { | |
"class": "gating", | |
"from": "conformer_7_conv_mod_pointwise_conv_1", | |
"activation": None, | |
"gate_activation": "sigmoid", | |
}, | |
"conformer_7_conv_mod_depthwise_conv": { | |
"class": "conv", | |
"from": "conformer_7_conv_mod_glu", | |
"n_out": 512, | |
"filter_size": (32,), | |
"padding": "same", | |
"with_bias": True, | |
"activation": None, | |
"L2": 0.0001, | |
"groups": 512, | |
}, | |
"conformer_7_conv_mod_bn": { | |
"class": "batch_norm", | |
"from": "conformer_7_conv_mod_depthwise_conv", | |
"momentum": 0.1, | |
"epsilon": 1e-05, | |
"update_sample_only_in_training": True, | |
"delay_sample_update": True, | |
}, | |
"conformer_7_conv_mod_swish": { | |
"class": "activation", | |
"from": "conformer_7_conv_mod_bn", | |
"activation": "swish", | |
}, | |
"conformer_7_conv_mod_pointwise_conv_2": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_7_conv_mod_swish", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_7_conv_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_7_conv_mod_pointwise_conv_2", | |
"dropout": 0.1, | |
}, | |
"conformer_7_conv_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_7_conv_mod_dropout", "conformer_7_ffmod_1_half_res_add"], | |
"kind": "add", | |
}, | |
"conformer_7_mhsa_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_7_conv_mod_res_add", | |
}, | |
"conformer_7_mhsa_mod_relpos_encoding": { | |
"class": "relative_positional_encoding", | |
"from": "conformer_7_mhsa_mod_ln", | |
"n_out": 64, | |
"clipping": 32, | |
}, | |
"conformer_7_mhsa_mod_self_attention": { | |
"class": "self_attention", | |
"from": "conformer_7_mhsa_mod_ln", | |
"n_out": 512, | |
"num_heads": 8, | |
"total_key_dim": 512, | |
"key_shift": "conformer_7_mhsa_mod_relpos_encoding", | |
"attention_dropout": 0.1, | |
}, | |
"conformer_7_mhsa_mod_att_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_7_mhsa_mod_self_attention", | |
"activation": None, | |
"L2": 0.0001, | |
"with_bias": False, | |
}, | |
"conformer_7_mhsa_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_7_mhsa_mod_att_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_7_mhsa_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_7_mhsa_mod_dropout", "conformer_7_conv_mod_res_add"], | |
"kind": "add", | |
}, | |
"conformer_7_ffmod_2_ln": { | |
"class": "layer_norm", | |
"from": "conformer_7_mhsa_mod_res_add", | |
}, | |
"conformer_7_ffmod_2_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_7_ffmod_2_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_7_ffmod_2_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_7_ffmod_2_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_7_ffmod_2_dropout": { | |
"class": "copy", | |
"from": "conformer_7_ffmod_2_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_7_ffmod_2_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_7_ffmod_2_dropout", "conformer_7_mhsa_mod_res_add"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_7_output": { | |
"class": "layer_norm", | |
"from": "conformer_7_ffmod_2_half_res_add", | |
}, | |
"conformer_8_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_7_output"}, | |
"conformer_8_ffmod_1_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_8_ffmod_1_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_8_ffmod_1_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_8_ffmod_1_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_8_ffmod_1_dropout": { | |
"class": "copy", | |
"from": "conformer_8_ffmod_1_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_8_ffmod_1_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_8_ffmod_1_dropout", "conformer_7_output"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_8_conv_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_8_ffmod_1_half_res_add", | |
}, | |
"conformer_8_conv_mod_pointwise_conv_1": { | |
"class": "linear", | |
"n_out": 1024, | |
"from": "conformer_8_conv_mod_ln", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_8_conv_mod_glu": { | |
"class": "gating", | |
"from": "conformer_8_conv_mod_pointwise_conv_1", | |
"activation": None, | |
"gate_activation": "sigmoid", | |
}, | |
"conformer_8_conv_mod_depthwise_conv": { | |
"class": "conv", | |
"from": "conformer_8_conv_mod_glu", | |
"n_out": 512, | |
"filter_size": (32,), | |
"padding": "same", | |
"with_bias": True, | |
"activation": None, | |
"L2": 0.0001, | |
"groups": 512, | |
}, | |
"conformer_8_conv_mod_bn": { | |
"class": "batch_norm", | |
"from": "conformer_8_conv_mod_depthwise_conv", | |
"momentum": 0.1, | |
"epsilon": 1e-05, | |
"update_sample_only_in_training": True, | |
"delay_sample_update": True, | |
}, | |
"conformer_8_conv_mod_swish": { | |
"class": "activation", | |
"from": "conformer_8_conv_mod_bn", | |
"activation": "swish", | |
}, | |
"conformer_8_conv_mod_pointwise_conv_2": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_8_conv_mod_swish", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_8_conv_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_8_conv_mod_pointwise_conv_2", | |
"dropout": 0.1, | |
}, | |
"conformer_8_conv_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_8_conv_mod_dropout", "conformer_8_ffmod_1_half_res_add"], | |
"kind": "add", | |
}, | |
"conformer_8_mhsa_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_8_conv_mod_res_add", | |
}, | |
"conformer_8_mhsa_mod_relpos_encoding": { | |
"class": "relative_positional_encoding", | |
"from": "conformer_8_mhsa_mod_ln", | |
"n_out": 64, | |
"clipping": 32, | |
}, | |
"conformer_8_mhsa_mod_self_attention": { | |
"class": "self_attention", | |
"from": "conformer_8_mhsa_mod_ln", | |
"n_out": 512, | |
"num_heads": 8, | |
"total_key_dim": 512, | |
"key_shift": "conformer_8_mhsa_mod_relpos_encoding", | |
"attention_dropout": 0.1, | |
}, | |
"conformer_8_mhsa_mod_att_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_8_mhsa_mod_self_attention", | |
"activation": None, | |
"L2": 0.0001, | |
"with_bias": False, | |
}, | |
"conformer_8_mhsa_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_8_mhsa_mod_att_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_8_mhsa_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_8_mhsa_mod_dropout", "conformer_8_conv_mod_res_add"], | |
"kind": "add", | |
}, | |
"conformer_8_ffmod_2_ln": { | |
"class": "layer_norm", | |
"from": "conformer_8_mhsa_mod_res_add", | |
}, | |
"conformer_8_ffmod_2_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_8_ffmod_2_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_8_ffmod_2_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_8_ffmod_2_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_8_ffmod_2_dropout": { | |
"class": "copy", | |
"from": "conformer_8_ffmod_2_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_8_ffmod_2_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_8_ffmod_2_dropout", "conformer_8_mhsa_mod_res_add"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_8_output": { | |
"class": "layer_norm", | |
"from": "conformer_8_ffmod_2_half_res_add", | |
}, | |
"conformer_9_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_8_output"}, | |
"conformer_9_ffmod_1_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_9_ffmod_1_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_9_ffmod_1_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_9_ffmod_1_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_9_ffmod_1_dropout": { | |
"class": "copy", | |
"from": "conformer_9_ffmod_1_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_9_ffmod_1_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_9_ffmod_1_dropout", "conformer_8_output"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_9_conv_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_9_ffmod_1_half_res_add", | |
}, | |
"conformer_9_conv_mod_pointwise_conv_1": { | |
"class": "linear", | |
"n_out": 1024, | |
"from": "conformer_9_conv_mod_ln", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_9_conv_mod_glu": { | |
"class": "gating", | |
"from": "conformer_9_conv_mod_pointwise_conv_1", | |
"activation": None, | |
"gate_activation": "sigmoid", | |
}, | |
"conformer_9_conv_mod_depthwise_conv": { | |
"class": "conv", | |
"from": "conformer_9_conv_mod_glu", | |
"n_out": 512, | |
"filter_size": (32,), | |
"padding": "same", | |
"with_bias": True, | |
"activation": None, | |
"L2": 0.0001, | |
"groups": 512, | |
}, | |
"conformer_9_conv_mod_bn": { | |
"class": "batch_norm", | |
"from": "conformer_9_conv_mod_depthwise_conv", | |
"momentum": 0.1, | |
"epsilon": 1e-05, | |
"update_sample_only_in_training": True, | |
"delay_sample_update": True, | |
}, | |
"conformer_9_conv_mod_swish": { | |
"class": "activation", | |
"from": "conformer_9_conv_mod_bn", | |
"activation": "swish", | |
}, | |
"conformer_9_conv_mod_pointwise_conv_2": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_9_conv_mod_swish", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_9_conv_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_9_conv_mod_pointwise_conv_2", | |
"dropout": 0.1, | |
}, | |
"conformer_9_conv_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_9_conv_mod_dropout", "conformer_9_ffmod_1_half_res_add"], | |
"kind": "add", | |
}, | |
"conformer_9_mhsa_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_9_conv_mod_res_add", | |
}, | |
"conformer_9_mhsa_mod_relpos_encoding": { | |
"class": "relative_positional_encoding", | |
"from": "conformer_9_mhsa_mod_ln", | |
"n_out": 64, | |
"clipping": 32, | |
}, | |
"conformer_9_mhsa_mod_self_attention": { | |
"class": "self_attention", | |
"from": "conformer_9_mhsa_mod_ln", | |
"n_out": 512, | |
"num_heads": 8, | |
"total_key_dim": 512, | |
"key_shift": "conformer_9_mhsa_mod_relpos_encoding", | |
"attention_dropout": 0.1, | |
}, | |
"conformer_9_mhsa_mod_att_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_9_mhsa_mod_self_attention", | |
"activation": None, | |
"L2": 0.0001, | |
"with_bias": False, | |
}, | |
"conformer_9_mhsa_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_9_mhsa_mod_att_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_9_mhsa_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_9_mhsa_mod_dropout", "conformer_9_conv_mod_res_add"], | |
"kind": "add", | |
}, | |
"conformer_9_ffmod_2_ln": { | |
"class": "layer_norm", | |
"from": "conformer_9_mhsa_mod_res_add", | |
}, | |
"conformer_9_ffmod_2_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_9_ffmod_2_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_9_ffmod_2_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_9_ffmod_2_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_9_ffmod_2_dropout": { | |
"class": "copy", | |
"from": "conformer_9_ffmod_2_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_9_ffmod_2_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_9_ffmod_2_dropout", "conformer_9_mhsa_mod_res_add"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_9_output": { | |
"class": "layer_norm", | |
"from": "conformer_9_ffmod_2_half_res_add", | |
}, | |
"conformer_10_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_9_output"}, | |
"conformer_10_ffmod_1_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_10_ffmod_1_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_10_ffmod_1_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_10_ffmod_1_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_10_ffmod_1_dropout": { | |
"class": "copy", | |
"from": "conformer_10_ffmod_1_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_10_ffmod_1_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_10_ffmod_1_dropout", "conformer_9_output"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_10_conv_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_10_ffmod_1_half_res_add", | |
}, | |
"conformer_10_conv_mod_pointwise_conv_1": { | |
"class": "linear", | |
"n_out": 1024, | |
"from": "conformer_10_conv_mod_ln", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_10_conv_mod_glu": { | |
"class": "gating", | |
"from": "conformer_10_conv_mod_pointwise_conv_1", | |
"activation": None, | |
"gate_activation": "sigmoid", | |
}, | |
"conformer_10_conv_mod_depthwise_conv": { | |
"class": "conv", | |
"from": "conformer_10_conv_mod_glu", | |
"n_out": 512, | |
"filter_size": (32,), | |
"padding": "same", | |
"with_bias": True, | |
"activation": None, | |
"L2": 0.0001, | |
"groups": 512, | |
}, | |
"conformer_10_conv_mod_bn": { | |
"class": "batch_norm", | |
"from": "conformer_10_conv_mod_depthwise_conv", | |
"momentum": 0.1, | |
"epsilon": 1e-05, | |
"update_sample_only_in_training": True, | |
"delay_sample_update": True, | |
}, | |
"conformer_10_conv_mod_swish": { | |
"class": "activation", | |
"from": "conformer_10_conv_mod_bn", | |
"activation": "swish", | |
}, | |
"conformer_10_conv_mod_pointwise_conv_2": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_10_conv_mod_swish", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_10_conv_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_10_conv_mod_pointwise_conv_2", | |
"dropout": 0.1, | |
}, | |
"conformer_10_conv_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_10_conv_mod_dropout", "conformer_10_ffmod_1_half_res_add"], | |
"kind": "add", | |
}, | |
"conformer_10_mhsa_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_10_conv_mod_res_add", | |
}, | |
"conformer_10_mhsa_mod_relpos_encoding": { | |
"class": "relative_positional_encoding", | |
"from": "conformer_10_mhsa_mod_ln", | |
"n_out": 64, | |
"clipping": 32, | |
}, | |
"conformer_10_mhsa_mod_self_attention": { | |
"class": "self_attention", | |
"from": "conformer_10_mhsa_mod_ln", | |
"n_out": 512, | |
"num_heads": 8, | |
"total_key_dim": 512, | |
"key_shift": "conformer_10_mhsa_mod_relpos_encoding", | |
"attention_dropout": 0.1, | |
}, | |
"conformer_10_mhsa_mod_att_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_10_mhsa_mod_self_attention", | |
"activation": None, | |
"L2": 0.0001, | |
"with_bias": False, | |
}, | |
"conformer_10_mhsa_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_10_mhsa_mod_att_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_10_mhsa_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_10_mhsa_mod_dropout", "conformer_10_conv_mod_res_add"], | |
"kind": "add", | |
}, | |
"conformer_10_ffmod_2_ln": { | |
"class": "layer_norm", | |
"from": "conformer_10_mhsa_mod_res_add", | |
}, | |
"conformer_10_ffmod_2_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_10_ffmod_2_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_10_ffmod_2_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_10_ffmod_2_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_10_ffmod_2_dropout": { | |
"class": "copy", | |
"from": "conformer_10_ffmod_2_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_10_ffmod_2_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_10_ffmod_2_dropout", "conformer_10_mhsa_mod_res_add"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_10_output": { | |
"class": "layer_norm", | |
"from": "conformer_10_ffmod_2_half_res_add", | |
}, | |
"conformer_11_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_10_output"}, | |
"conformer_11_ffmod_1_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_11_ffmod_1_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_11_ffmod_1_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_11_ffmod_1_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_11_ffmod_1_dropout": { | |
"class": "copy", | |
"from": "conformer_11_ffmod_1_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_11_ffmod_1_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_11_ffmod_1_dropout", "conformer_10_output"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_11_conv_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_11_ffmod_1_half_res_add", | |
}, | |
"conformer_11_conv_mod_pointwise_conv_1": { | |
"class": "linear", | |
"n_out": 1024, | |
"from": "conformer_11_conv_mod_ln", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_11_conv_mod_glu": { | |
"class": "gating", | |
"from": "conformer_11_conv_mod_pointwise_conv_1", | |
"activation": None, | |
"gate_activation": "sigmoid", | |
}, | |
"conformer_11_conv_mod_depthwise_conv": { | |
"class": "conv", | |
"from": "conformer_11_conv_mod_glu", | |
"n_out": 512, | |
"filter_size": (32,), | |
"padding": "same", | |
"with_bias": True, | |
"activation": None, | |
"L2": 0.0001, | |
"groups": 512, | |
}, | |
"conformer_11_conv_mod_bn": { | |
"class": "batch_norm", | |
"from": "conformer_11_conv_mod_depthwise_conv", | |
"momentum": 0.1, | |
"epsilon": 1e-05, | |
"update_sample_only_in_training": True, | |
"delay_sample_update": True, | |
}, | |
"conformer_11_conv_mod_swish": { | |
"class": "activation", | |
"from": "conformer_11_conv_mod_bn", | |
"activation": "swish", | |
}, | |
"conformer_11_conv_mod_pointwise_conv_2": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_11_conv_mod_swish", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_11_conv_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_11_conv_mod_pointwise_conv_2", | |
"dropout": 0.1, | |
}, | |
"conformer_11_conv_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_11_conv_mod_dropout", "conformer_11_ffmod_1_half_res_add"], | |
"kind": "add", | |
}, | |
"conformer_11_mhsa_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_11_conv_mod_res_add", | |
}, | |
"conformer_11_mhsa_mod_relpos_encoding": { | |
"class": "relative_positional_encoding", | |
"from": "conformer_11_mhsa_mod_ln", | |
"n_out": 64, | |
"clipping": 32, | |
}, | |
"conformer_11_mhsa_mod_self_attention": { | |
"class": "self_attention", | |
"from": "conformer_11_mhsa_mod_ln", | |
"n_out": 512, | |
"num_heads": 8, | |
"total_key_dim": 512, | |
"key_shift": "conformer_11_mhsa_mod_relpos_encoding", | |
"attention_dropout": 0.1, | |
}, | |
"conformer_11_mhsa_mod_att_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_11_mhsa_mod_self_attention", | |
"activation": None, | |
"L2": 0.0001, | |
"with_bias": False, | |
}, | |
"conformer_11_mhsa_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_11_mhsa_mod_att_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_11_mhsa_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_11_mhsa_mod_dropout", "conformer_11_conv_mod_res_add"], | |
"kind": "add", | |
}, | |
"conformer_11_ffmod_2_ln": { | |
"class": "layer_norm", | |
"from": "conformer_11_mhsa_mod_res_add", | |
}, | |
"conformer_11_ffmod_2_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_11_ffmod_2_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_11_ffmod_2_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_11_ffmod_2_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_11_ffmod_2_dropout": { | |
"class": "copy", | |
"from": "conformer_11_ffmod_2_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_11_ffmod_2_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_11_ffmod_2_dropout", "conformer_11_mhsa_mod_res_add"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_11_output": { | |
"class": "layer_norm", | |
"from": "conformer_11_ffmod_2_half_res_add", | |
}, | |
"conformer_12_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_11_output"}, | |
"conformer_12_ffmod_1_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_12_ffmod_1_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_12_ffmod_1_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_12_ffmod_1_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_12_ffmod_1_dropout": { | |
"class": "copy", | |
"from": "conformer_12_ffmod_1_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_12_ffmod_1_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_12_ffmod_1_dropout", "conformer_11_output"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_12_conv_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_12_ffmod_1_half_res_add", | |
}, | |
"conformer_12_conv_mod_pointwise_conv_1": { | |
"class": "linear", | |
"n_out": 1024, | |
"from": "conformer_12_conv_mod_ln", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_12_conv_mod_glu": { | |
"class": "gating", | |
"from": "conformer_12_conv_mod_pointwise_conv_1", | |
"activation": None, | |
"gate_activation": "sigmoid", | |
}, | |
"conformer_12_conv_mod_depthwise_conv": { | |
"class": "conv", | |
"from": "conformer_12_conv_mod_glu", | |
"n_out": 512, | |
"filter_size": (32,), | |
"padding": "same", | |
"with_bias": True, | |
"activation": None, | |
"L2": 0.0001, | |
"groups": 512, | |
}, | |
"conformer_12_conv_mod_bn": { | |
"class": "batch_norm", | |
"from": "conformer_12_conv_mod_depthwise_conv", | |
"momentum": 0.1, | |
"epsilon": 1e-05, | |
"update_sample_only_in_training": True, | |
"delay_sample_update": True, | |
}, | |
"conformer_12_conv_mod_swish": { | |
"class": "activation", | |
"from": "conformer_12_conv_mod_bn", | |
"activation": "swish", | |
}, | |
"conformer_12_conv_mod_pointwise_conv_2": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_12_conv_mod_swish", | |
"activation": None, | |
"L2": 0.0001, | |
}, | |
"conformer_12_conv_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_12_conv_mod_pointwise_conv_2", | |
"dropout": 0.1, | |
}, | |
"conformer_12_conv_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_12_conv_mod_dropout", "conformer_12_ffmod_1_half_res_add"], | |
"kind": "add", | |
}, | |
"conformer_12_mhsa_mod_ln": { | |
"class": "layer_norm", | |
"from": "conformer_12_conv_mod_res_add", | |
}, | |
"conformer_12_mhsa_mod_relpos_encoding": { | |
"class": "relative_positional_encoding", | |
"from": "conformer_12_mhsa_mod_ln", | |
"n_out": 64, | |
"clipping": 32, | |
}, | |
"conformer_12_mhsa_mod_self_attention": { | |
"class": "self_attention", | |
"from": "conformer_12_mhsa_mod_ln", | |
"n_out": 512, | |
"num_heads": 8, | |
"total_key_dim": 512, | |
"key_shift": "conformer_12_mhsa_mod_relpos_encoding", | |
"attention_dropout": 0.1, | |
}, | |
"conformer_12_mhsa_mod_att_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_12_mhsa_mod_self_attention", | |
"activation": None, | |
"L2": 0.0001, | |
"with_bias": False, | |
}, | |
"conformer_12_mhsa_mod_dropout": { | |
"class": "copy", | |
"from": "conformer_12_mhsa_mod_att_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_12_mhsa_mod_res_add": { | |
"class": "combine", | |
"from": ["conformer_12_mhsa_mod_dropout", "conformer_12_conv_mod_res_add"], | |
"kind": "add", | |
}, | |
"conformer_12_ffmod_2_ln": { | |
"class": "layer_norm", | |
"from": "conformer_12_mhsa_mod_res_add", | |
}, | |
"conformer_12_ffmod_2_linear_swish": { | |
"class": "linear", | |
"n_out": 2048, | |
"from": "conformer_12_ffmod_2_ln", | |
"activation": "swish", | |
"L2": 0.0001, | |
}, | |
"conformer_12_ffmod_2_dropout_linear": { | |
"class": "linear", | |
"n_out": 512, | |
"from": "conformer_12_ffmod_2_linear_swish", | |
"activation": None, | |
"L2": 0.0001, | |
"dropout": 0.1, | |
}, | |
"conformer_12_ffmod_2_dropout": { | |
"class": "copy", | |
"from": "conformer_12_ffmod_2_dropout_linear", | |
"dropout": 0.1, | |
}, | |
"conformer_12_ffmod_2_half_res_add": { | |
"class": "eval", | |
"from": ["conformer_12_ffmod_2_dropout", "conformer_12_mhsa_mod_res_add"], | |
"eval": "0.5 * source(0) + source(1)", | |
}, | |
"conformer_12_output": { | |
"class": "layer_norm", | |
"from": "conformer_12_ffmod_2_half_res_add", | |
}, | |
"encoder": {"class": "copy", "from": "conformer_12_output"}, | |
"output": { | |
"class": "softmax", | |
"from": "encoder", | |
"loss": "fast_bw", | |
"loss_opts": { | |
"sprint_opts": { | |
"sprintExecPath": "/work/asr4/vieting/programs/rasr/20230707/rasr/arch/linux-x86_64-standard/nn-trainer.linux-x86_64-standard", | |
"sprintConfigStr": "--*.configuration.channel=output-channel " | |
"--*.real-time-factor.channel=output-channel --*.system-info.channel=output-channel " | |
"--*.time.channel=output-channel --*.version.channel=output-channel " | |
"--*.log.channel=output-channel --*.warning.channel=output-channel, stderr " | |
"--*.error.channel=output-channel, stderr --*.statistics.channel=output-channel " | |
"--*.progress.channel=output-channel --*.dot.channel=nil " | |
"--*.corpus.file=/u/maximilian.kannen/setups/20230406_feat/work/i6_core/datasets/switchboard/CreateSwitchboardBlissCorpusJob.Z1EMi4TdrUS6/output/swb.corpus.xml.gz " | |
"--*.corpus.segments.file=/u/maximilian.kannen/setups/20230406_feat/work/i6_core/corpus/filter/FilterSegmentsByListJob.nrKcBIdsMBZm/output/segments.1 " | |
"--*.model-combination.lexicon.file=/u/maximilian.kannen/setups/20230406_feat/work/i6_experiments/users/berger/recipe/lexicon/modification/MakeBlankLexiconJob.N8RlHYKzilei/output/lexicon.xml " | |
"--*.model-combination.acoustic-model.state-tying.type=lookup " | |
"--*.model-combination.acoustic-model.state-tying.file=/u/vieting/setups/swb/20230406_feat/dependencies/state-tying_blank " | |
"--*.model-combination.acoustic-model.allophones.add-from-lexicon=no " | |
"--*.model-combination.acoustic-model.allophones.add-all=yes " | |
"--*.model-combination.acoustic-model.allophones.add-from-file=/u/vieting/setups/swb/20230406_feat/dependencies/allophones_blank " | |
"--*.model-combination.acoustic-model.hmm.states-per-phone=1 " | |
"--*.model-combination.acoustic-model.hmm.state-repetitions=1 " | |
"--*.model-combination.acoustic-model.hmm.across-word-model=yes " | |
"--*.model-combination.acoustic-model.hmm.early-recombination=no " | |
"--*.model-combination.acoustic-model.tdp.scale=1.0 " | |
"--*.model-combination.acoustic-model.tdp.*.loop=0.0 " | |
"--*.model-combination.acoustic-model.tdp.*.forward=0.0 " | |
"--*.model-combination.acoustic-model.tdp.*.skip=infinity " | |
"--*.model-combination.acoustic-model.tdp.*.exit=0.0 " | |
"--*.model-combination.acoustic-model.tdp.silence.loop=0.0 " | |
"--*.model-combination.acoustic-model.tdp.silence.forward=0.0 " | |
"--*.model-combination.acoustic-model.tdp.silence.skip=infinity " | |
"--*.model-combination.acoustic-model.tdp.silence.exit=0.0 " | |
"--*.model-combination.acoustic-model.tdp.entry-m1.loop=infinity " | |
"--*.model-combination.acoustic-model.tdp.entry-m2.loop=infinity " | |
"--*.model-combination.acoustic-model.phonology.history-length=0 " | |
"--*.model-combination.acoustic-model.phonology.future-length=0 " | |
"--*.transducer-builder-filter-out-invalid-allophones=yes " | |
"--*.fix-allophone-context-at-word-boundaries=yes " | |
"--*.allophone-state-graph-builder.topology=ctc " | |
"--*.allow-for-silence-repetitions=no --action=python-control " | |
"--python-control-loop-type=python-control-loop --extract-features=no " | |
"--*.encoding=UTF-8 --*.output-channel.file=$(LOGFILE) " | |
"--*.output-channel.compressed=no --*.output-channel.append=no " | |
"--*.output-channel.unbuffered=no --*.LOGFILE=nn-trainer.loss.log --*.TASK=1", | |
"minPythonControlVersion": 4, | |
"numInstances": 2, | |
"usePythonSegmentOrder": False, | |
}, | |
"tdp_scale": 0.0, | |
}, | |
"target": None, | |
"n_out": 88, | |
}, | |
"features": { | |
"class": "subnetwork", | |
"from": ["data"], | |
"subnetwork": { | |
"conv_h_filter": { | |
"class": "variable", | |
"shape": (128, 1, 150), | |
"init": "glorot_uniform", | |
}, | |
"conv_h": { | |
"class": "conv", | |
"filter_size": (128,), | |
"strides": 5, | |
"n_out": 150, | |
"padding": "valid", | |
"filter": "conv_h_filter", | |
"from": "data", | |
}, | |
"conv_h_split": { | |
"class": "split_dims", | |
"axis": "F", | |
"dims": (-1, 1), | |
"from": "conv_h_act", | |
}, | |
"conv_l": { | |
"class": "conv", | |
"filter_size": (40, 1), | |
"strides": (16, 1), | |
"n_out": 5, | |
"padding": "valid", | |
"from": "conv_h_split", | |
}, | |
"conv_l_merge": { | |
"class": "merge_dims", | |
"axes": "except_time", | |
"from": "conv_l", | |
}, | |
"output": {"class": "copy", "from": "conv_l_act"}, | |
"conv_h_act": { | |
"class": "eval", | |
"eval": "tf.abs(source(0))", | |
"from": "conv_h", | |
}, | |
"conv_l_act": {"class": "layer_norm", "from": ["conv_l_act_no_norm"]}, | |
"conv_l_act_no_norm": { | |
"class": "eval", | |
"eval": "tf.pow(tf.abs(source(0)) + 1e-05, 1 / 2.5)", | |
"from": ["conv_l_merge"], | |
}, | |
}, | |
"trainable": True, | |
}, | |
} | |
# --- Learning-rate scheduling ("newbob" settings) ---------------------------
# NOTE(review): presumably the factor the learning rate is multiplied by when
# the newbob scheduler decides to decay -- confirm against the RETURNN docs.
newbob_learning_rate_decay = 0.9
# Same value as "partition_epoch" of the train dataset below (both are 6).
newbob_multi_num_epochs = 6
newbob_multi_update_interval = 1

# --- Training run -----------------------------------------------------------
# NOTE(review): with partition_epoch = 6 below this presumably counts
# sub-epochs rather than full passes over the corpus -- confirm.
num_epochs = 450
optimizer = {"class": "nadam", "epsilon": 1e-08}
save_interval = 1
target = "classes"
task = "train"
tf_log_memory_usage = True

# --- Training data ----------------------------------------------------------
# An OggZipDataset wrapped in a MultiProcDataset with two workers.
train = {
    "class": "MultiProcDataset",
    "dataset": {
        "class": "OggZipDataset",
        # Raw audio features (no feature extraction configured here) with
        # peak normalization enabled.
        "audio": {"features": "raw", "peak_normalization": True},
        "partition_epoch": 6,
        "path": [
            "/u/maximilian.kannen/setups/20230406_feat/work/i6_core/returnn/oggzip/BlissToOggZipJob.lAFM8R9mzLpI/output/out.ogg.zip"
        ],
        "seq_ordering": "laplace:.384",
        "use_cache_manager": True,
        "segment_file": "/u/maximilian.kannen/setups/20230406_feat/work/i6_core/corpus/filter/FilterSegmentsByListJob.SVlbt6fqP4Jn/output/segments.1",
        # No targets are read from the dataset itself.
        "targets": None,
    },
    "num_workers": 2,
    "buffer_size": 5,
}

# --- Backend / diagnostics --------------------------------------------------
use_tensorflow = True
watch_memory = True

# Extra settings could be injected through this dict; it is empty here, so the
# update below is a no-op. At module level locals() is the module namespace,
# which is why updating it defines top-level names.
config = {}
locals().update(**config)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment