Skip to content

Instantly share code, notes, and snippets.

@Max-Ryujin
Created October 31, 2023 09:51
Show Gist options
  • Save Max-Ryujin/f7323769568c9803bd19e3eba623c2d1 to your computer and use it in GitHub Desktop.
Save Max-Ryujin/f7323769568c9803bd19e3eba623c2d1 to your computer and use it in GitHub Desktop.
returnn.config for the setup /u/maximilian.kannen/setups/20230406_feat/recipe/i6_experiments/users/vieting/experiments/switchboard/ctc/feat/experiments.run_scf_baseline_big
#!rnn.py
import sys
# Raise the interpreter's recursion limit to 3000.
# NOTE(review): presumably required because handling the large nested network
# definition in this config recurses deeply — confirm before changing.
sys.setrecursionlimit(3000)
def _mask(x, batch_axis, axis, pos, max_amount):
    """
    Zero out one randomly sized window along ``axis``, separately per batch entry.

    For each batch entry ``b`` a width is drawn uniformly from ``[1, max_amount]``;
    every index ``i`` along ``axis`` with ``pos[b] <= i < min(pos[b] + width, dim)``
    is replaced by ``0.0``. The window is clipped at the end of the axis.

    :param tf.Tensor x: (batch,time,feature)
    :param int batch_axis: index of the batch axis in x
    :param int axis: index of the axis along which the window is placed
    :param tf.Tensor pos: (batch,) window start index per batch entry
    :param int|tf.Tensor max_amount: inclusive upper bound of the window width
    :return: x with the selected windows set to 0.0
    :rtype: tf.Tensor
    """
    import tensorflow as tf
    from TFUtil import where_bc

    rank = x.get_shape().ndims
    num_seqs = tf.shape(x)[batch_axis]
    axis_len = tf.shape(x)[axis]
    # maxval is exclusive, so +1 makes max_amount itself reachable.
    width = tf.random.uniform(
        shape=(num_seqs,), minval=1, maxval=max_amount + 1, dtype=tf.int32
    )
    stop = tf.math.minimum(pos + width, axis_len)  # clip window to the axis length
    positions = tf.expand_dims(tf.range(0, axis_len), 0)  # (1,dim)
    start_col = tf.expand_dims(pos, 1)  # (batch,1)
    stop_col = tf.expand_dims(stop, 1)  # (batch,1)
    is_after_start = tf.greater_equal(positions, start_col)
    is_before_stop = tf.less(positions, stop_col)
    in_window = tf.math.logical_and(is_after_start, is_before_stop)  # (batch,dim)
    if batch_axis > axis:
        # Bring the two axes into the same relative order they have in x.
        in_window = tf.transpose(in_window)  # (dim,batch)
    # Keep the real sizes on the batch/masked axes, size 1 everywhere else, so
    # the selection below relies on where_bc broadcasting the mask against x.
    target_shape = []
    for i in range(rank):
        if i in (batch_axis, axis):
            target_shape.append(tf.shape(x)[i])
        else:
            target_shape.append(1)
    in_window = tf.reshape(in_window, target_shape)
    return where_bc(in_window, 0.0, x)
def random_mask(x, batch_axis, axis, min_num, max_num, max_dims):
    """
    Apply a random number of random-width zero masks along ``axis``.

    Per batch entry, between ``min_num`` and ``max_num`` (inclusive) windows are
    zeroed via :func:`_mask`. Window start positions are distinct indices of
    ``axis``, drawn by taking the top-k of i.i.d. Gumbel noise.

    The per-entry gate is reshaped along ``batch_axis``, so x may have any
    statically known rank and any batch axis position (the former implementation
    only worked for rank 3 with the batch on axis 0).

    :param tf.Tensor x: e.g. (batch,time,feature); the rank must be known statically
    :param int batch_axis: index of the batch axis in x (non-negative)
    :param int axis: index of the axis along which to mask (non-negative)
    :param int|tf.Tensor min_num: minimum number of masks per batch entry
    :param int|tf.Tensor max_num: inclusive maximum number of masks per batch entry
    :param int|tf.Tensor max_dims: inclusive maximum width of a single mask
    :return: masked x, same shape as the input
    :rtype: tf.Tensor
    """
    import tensorflow as tf

    n_batch = tf.shape(x)[batch_axis]
    if isinstance(min_num, int) and isinstance(max_num, int) and min_num == max_num:
        # Static, identical count for every batch entry.
        num = min_num
    else:
        # maxval is exclusive, so +1 makes max_num itself reachable.
        num = tf.random.uniform(
            shape=(n_batch,), minval=min_num, maxval=max_num + 1, dtype=tf.int32
        )
    # Gumbel-max trick: top-k over Gumbel noise samples k positions without replacement.
    # https://github.com/tensorflow/tensorflow/issues/9260
    # https://timvieira.github.io/blog/post/2014/08/01/gumbel-max-trick-and-weighted-reservoir-sampling/
    z = -tf.math.log(
        -tf.math.log(tf.random.uniform((n_batch, tf.shape(x)[axis]), 0, 1))
    )
    # Largest count needed by any batch entry; computed once and reused as the
    # loop bound instead of being re-evaluated in every loop condition.
    max_count = num if isinstance(num, int) else tf.reduce_max(num)
    # indices: (batch,max_count), int32 entries in [0,dim), ordered by descending noise value.
    _, indices = tf.math.top_k(z, max_count)

    if isinstance(num, int):
        # Static count: unroll in Python.
        for i in range(num):
            x = _mask(
                x,
                batch_axis=batch_axis,
                axis=axis,
                pos=indices[:, i],
                max_amount=max_dims,
            )
        return x

    # Shape that places the (batch,) gate on batch_axis and broadcasts over all other axes.
    ndim = x.get_shape().ndims
    gate_shape = [-1 if ax == batch_axis else 1 for ax in range(ndim)]

    def _body(i, cur):
        """Apply the i-th mask only to batch entries that still have masks left (i < num)."""
        next_i = i + 1
        apply_mask = tf.reshape(tf.less(i, num), gate_shape)
        masked = _mask(
            cur,
            batch_axis=batch_axis,
            axis=axis,
            pos=indices[:, i],
            max_amount=max_dims,
        )
        return next_i, tf.where(apply_mask, masked, cur)

    _, x = tf.while_loop(
        cond=lambda i, _: tf.less(i, max_count),
        body=_body,
        loop_vars=(0, x),
    )
    return x
def transform(data, max_time_num, max_time, max_feature_num, max_feature, network):
    """
    Mask random windows along the time axis and then the feature axis of ``data``.

    The masked tensor is produced by the first branch passed to
    ``network.cond_on_train``; the second branch returns the input unchanged
    (presumably the first is used in training and the second otherwise — confirm
    against ``cond_on_train``).

    While ``network.global_train_step`` is below 2000, both maximum mask counts
    are integer-divided by 2.

    :param data: object providing ``placeholder``, ``batch_dim_axis``,
        ``time_dim_axis`` and ``feature_dim_axis``
    :param int max_time_num: lower bound for the maximum number of time masks
    :param int max_time: inclusive maximum width of one time mask
    :param int max_feature_num: maximum number of feature masks
    :param int max_feature: inclusive maximum width of one feature mask
    :param network: object providing ``global_train_step`` and ``cond_on_train``
    :return: result of ``network.cond_on_train`` over the masked/unmasked branches
    """
    # halved before this step
    halving_until_step = 2000
    x = data.placeholder
    import tensorflow as tf

    global_step = network.global_train_step
    # 1 while global_step < halving_until_step, 0 afterwards; added to the divisor below.
    early_phase = tf.where(tf.greater_equal(global_step, halving_until_step), 0, 1)

    def get_masked():
        """Build the masked variant: time masking first, feature masking second."""
        # Maximum number of time masks grows with the sequence length, but is
        # never below max_time_num (before the early-phase halving).
        time_mask_limit = tf.maximum(
            tf.shape(x)[data.time_dim_axis] // int(1.0 / 0.7 * max_time),
            max_time_num,
        ) // (1 + early_phase)
        time_masked = random_mask(
            x,
            batch_axis=data.batch_dim_axis,
            axis=data.time_dim_axis,
            min_num=0,
            max_num=time_mask_limit,
            max_dims=max_time,
        )
        feature_mask_limit = max_feature_num // (1 + early_phase)
        return random_mask(
            time_masked,
            batch_axis=data.batch_dim_axis,
            axis=data.feature_dim_axis,
            min_num=0,
            max_num=feature_mask_limit,
            max_dims=max_feature,
        )

    def get_unmasked():
        """Pass the input through untouched."""
        return x

    x = network.cond_on_train(get_masked, get_unmasked)
    return x
# Batch size limits, given separately for the "classes" and "data" keys.
# NOTE(review): units are presumably time steps of the respective stream
# (raw audio samples for "data") — confirm against the RETURNN version in use.
batch_size = {"classes": 5000, "data": 400000}
cache_size = "0"
# Checkpoint retention: last 5, best 5, and epoch 450 is always kept.
cleanup_old_models = {"keep_last_n": 5, "keep_best_n": 5, "keep": [450]}
debug_print_layer_output_template = True
# Dev dataset: OggZipDataset over a single ogg-zip archive, yielding raw
# audio ("features": "raw") with peak normalization enabled. "targets" is None,
# so no target configuration is given here.
# NOTE(review): segment_file presumably restricts the archive to the listed
# segments — confirm.
dev = {
"class": "OggZipDataset",
"audio": {"features": "raw", "peak_normalization": True},
"partition_epoch": 1,
"path": [
"/u/maximilian.kannen/setups/20230406_feat/work/i6_core/returnn/oggzip/BlissToOggZipJob.lAFM8R9mzLpI/output/out.ogg.zip"
],
"seq_ordering": "sorted_reverse",
"use_cache_manager": True,
"segment_file": "/u/maximilian.kannen/setups/20230406_feat/work/i6_core/corpus/filter/FilterSegmentsByListJob.Fzh6DWEkIA5y/output/segments.1",
"targets": None,
}
device = "gpu"
# Additional evaluation datasets, keyed by name. "devtrain" reads the same
# ogg-zip archive and audio settings as `dev`, but with a different
# segment_file.
eval_datasets = {
"devtrain": {
"class": "OggZipDataset",
"audio": {"features": "raw", "peak_normalization": True},
"partition_epoch": 1,
"path": [
"/u/maximilian.kannen/setups/20230406_feat/work/i6_core/returnn/oggzip/BlissToOggZipJob.lAFM8R9mzLpI/output/out.ogg.zip"
],
"seq_ordering": "sorted_reverse",
"use_cache_manager": True,
"segment_file": "/u/maximilian.kannen/setups/20230406_feat/work/i6_core/text/processing/TailJob.RiSM6fe2XipO/output/out.gz",
"targets": None,
}
}
# Network input description: one stream "data" with dim 1, matching the raw
# audio ("features": "raw") configured in the datasets.
extern_data = {"data": {"dim": 1}}
gradient_noise = 0.0
# Learning rate control settings.
# NOTE(review): how these interact with the explicit `learning_rates` list
# is decided by RETURNN's learning rate control — confirm.
learning_rate_control = "newbob_multi_epoch"
learning_rate_control_min_num_epochs_per_new_lr = 3
learning_rate_control_relative_error_relative_lr = True
learning_rate_file = "learning_rates"
# Explicit learning rate schedule with 362 entries:
#   * entries 1-181: linear increase from 1.325e-05 to 4e-04
#     (step ~2.1486e-06 per entry),
#   * entries 182-362: linear decrease from 4e-04 to 1e-05
#     (step ~2.1667e-06 per entry).
# NOTE(review): presumably one entry per (sub-)epoch — confirm.
learning_rates = [
1.325e-05,
1.539861111111111e-05,
1.754722222222222e-05,
1.9695833333333335e-05,
2.1844444444444446e-05,
2.3993055555555557e-05,
2.6141666666666667e-05,
2.8290277777777778e-05,
3.043888888888889e-05,
3.25875e-05,
3.473611111111111e-05,
3.688472222222222e-05,
3.903333333333334e-05,
4.118194444444444e-05,
4.333055555555556e-05,
4.547916666666666e-05,
4.762777777777778e-05,
4.9776388888888884e-05,
5.1925e-05,
5.4073611111111106e-05,
5.622222222222222e-05,
5.837083333333333e-05,
6.0519444444444444e-05,
6.266805555555555e-05,
6.481666666666667e-05,
6.696527777777778e-05,
6.911388888888889e-05,
7.12625e-05,
7.341111111111111e-05,
7.555972222222221e-05,
7.770833333333333e-05,
7.985694444444443e-05,
8.200555555555555e-05,
8.415416666666667e-05,
8.630277777777777e-05,
8.845138888888889e-05,
9.059999999999999e-05,
9.274861111111111e-05,
9.489722222222221e-05,
9.704583333333333e-05,
9.919444444444444e-05,
0.00010134305555555555,
0.00010349166666666666,
0.00010564027777777777,
0.00010778888888888888,
0.0001099375,
0.0001120861111111111,
0.00011423472222222222,
0.00011638333333333333,
0.00011853194444444444,
0.00012068055555555556,
0.00012282916666666666,
0.00012497777777777778,
0.0001271263888888889,
0.000129275,
0.0001314236111111111,
0.00013357222222222222,
0.00013572083333333334,
0.00013786944444444443,
0.00014001805555555554,
0.00014216666666666666,
0.00014431527777777778,
0.00014646388888888887,
0.0001486125,
0.0001507611111111111,
0.00015290972222222222,
0.00015505833333333334,
0.00015720694444444443,
0.00015935555555555555,
0.00016150416666666666,
0.00016365277777777778,
0.00016580138888888887,
0.00016795,
0.0001700986111111111,
0.00017224722222222222,
0.00017439583333333331,
0.00017654444444444443,
0.00017869305555555555,
0.00018084166666666667,
0.00018299027777777776,
0.00018513888888888887,
0.0001872875,
0.0001894361111111111,
0.00019158472222222223,
0.00019373333333333332,
0.00019588194444444443,
0.00019803055555555555,
0.00020017916666666667,
0.00020232777777777776,
0.00020447638888888888,
0.000206625,
0.0002087736111111111,
0.0002109222222222222,
0.00021307083333333332,
0.00021521944444444444,
0.00021736805555555555,
0.00021951666666666667,
0.00022166527777777776,
0.00022381388888888888,
0.0002259625,
0.00022811111111111111,
0.0002302597222222222,
0.00023240833333333332,
0.00023455694444444444,
0.00023670555555555556,
0.00023885416666666665,
0.00024100277777777776,
0.00024315138888888888,
0.0002453,
0.0002474486111111111,
0.00024959722222222223,
0.0002517458333333333,
0.00025389444444444447,
0.00025604305555555556,
0.0002581916666666667,
0.0002603402777777778,
0.0002624888888888889,
0.00026463750000000003,
0.0002667861111111111,
0.00026893472222222226,
0.00027108333333333335,
0.00027323194444444444,
0.0002753805555555556,
0.0002775291666666667,
0.00027967777777777777,
0.0002818263888888889,
0.000283975,
0.00028612361111111115,
0.00028827222222222224,
0.00029042083333333333,
0.0002925694444444445,
0.00029471805555555556,
0.0002968666666666667,
0.0002990152777777778,
0.0003011638888888889,
0.00030331250000000003,
0.0003054611111111111,
0.0003076097222222222,
0.00030975833333333336,
0.00031190694444444445,
0.0003140555555555556,
0.0003162041666666667,
0.0003183527777777778,
0.0003205013888888889,
0.00032265,
0.00032479861111111115,
0.00032694722222222224,
0.00032909583333333333,
0.0003312444444444445,
0.00033339305555555557,
0.00033554166666666666,
0.0003376902777777778,
0.0003398388888888889,
0.00034198750000000004,
0.00034413611111111113,
0.0003462847222222222,
0.00034843333333333336,
0.00035058194444444445,
0.00035273055555555554,
0.0003548791666666667,
0.0003570277777777778,
0.0003591763888888889,
0.000361325,
0.0003634736111111111,
0.00036562222222222225,
0.00036777083333333334,
0.0003699194444444445,
0.0003720680555555556,
0.00037421666666666666,
0.0003763652777777778,
0.0003785138888888889,
0.0003806625,
0.00038281111111111113,
0.0003849597222222222,
0.00038710833333333337,
0.00038925694444444446,
0.00039140555555555555,
0.0003935541666666667,
0.0003957027777777778,
0.00039785138888888893,
0.0004,
# Peak reached; the linear decrease starts here.
0.0004,
0.00039783333333333337,
0.00039566666666666667,
0.0003935,
0.0003913333333333334,
0.0003891666666666667,
0.00038700000000000003,
0.0003848333333333334,
0.0003826666666666667,
0.00038050000000000003,
0.00037833333333333333,
0.0003761666666666667,
0.00037400000000000004,
0.00037183333333333334,
0.0003696666666666667,
0.00036750000000000004,
0.00036533333333333334,
0.0003631666666666667,
0.000361,
0.00035883333333333335,
0.0003566666666666667,
0.0003545,
0.00035233333333333335,
0.0003501666666666667,
0.000348,
0.00034583333333333335,
0.0003436666666666667,
0.0003415,
0.00033933333333333336,
0.0003371666666666667,
0.000335,
0.00033283333333333336,
0.0003306666666666667,
0.0003285,
0.00032633333333333337,
0.0003241666666666667,
0.000322,
0.0003198333333333334,
0.00031766666666666667,
0.0003155,
0.0003133333333333333,
0.0003111666666666667,
0.00030900000000000003,
0.00030683333333333333,
0.0003046666666666667,
0.00030250000000000003,
0.00030033333333333333,
0.0002981666666666667,
0.00029600000000000004,
0.00029383333333333334,
0.0002916666666666667,
0.00028950000000000004,
0.00028733333333333334,
0.0002851666666666667,
0.00028300000000000005,
0.00028083333333333335,
0.0002786666666666667,
0.00027650000000000005,
0.00027433333333333335,
0.0002721666666666667,
0.00027000000000000006,
0.00026783333333333336,
0.00026566666666666666,
0.00026350000000000006,
0.00026133333333333336,
0.00025916666666666666,
0.00025700000000000007,
0.00025483333333333337,
0.00025266666666666666,
0.0002505,
0.00024833333333333337,
0.00024616666666666667,
0.00024400000000000002,
0.00024183333333333337,
0.0002396666666666667,
0.00023750000000000003,
0.00023533333333333335,
0.0002331666666666667,
0.00023100000000000003,
0.00022883333333333336,
0.00022666666666666668,
0.00022450000000000004,
0.00022233333333333336,
0.0002201666666666667,
0.00021800000000000004,
0.00021583333333333337,
0.0002136666666666667,
0.00021150000000000002,
0.00020933333333333337,
0.0002071666666666667,
0.00020500000000000002,
0.00020283333333333338,
0.0002006666666666667,
0.00019850000000000003,
0.00019633333333333335,
0.0001941666666666667,
0.00019200000000000003,
0.00018983333333333336,
0.0001876666666666667,
0.00018550000000000004,
0.00018333333333333336,
0.0001811666666666667,
0.00017900000000000004,
0.00017683333333333337,
0.0001746666666666667,
0.00017250000000000005,
0.00017033333333333337,
0.0001681666666666667,
0.00016600000000000002,
0.00016383333333333338,
0.0001616666666666667,
0.00015950000000000003,
0.00015733333333333338,
0.00015516666666666668,
0.00015300000000000003,
0.00015083333333333339,
0.00014866666666666668,
0.00014650000000000004,
0.0001443333333333334,
0.0001421666666666667,
0.00014000000000000004,
0.0001378333333333334,
0.0001356666666666667,
0.00013350000000000005,
0.00013133333333333335,
0.0001291666666666667,
0.00012700000000000005,
0.00012483333333333335,
0.0001226666666666667,
0.00012050000000000006,
0.00011833333333333335,
0.00011616666666666671,
0.00011400000000000006,
0.00011183333333333336,
0.00010966666666666671,
0.00010750000000000001,
0.00010533333333333336,
0.00010316666666666672,
0.00010100000000000002,
9.883333333333337e-05,
9.666666666666672e-05,
9.450000000000002e-05,
9.233333333333337e-05,
9.016666666666673e-05,
8.800000000000002e-05,
8.583333333333338e-05,
8.366666666666673e-05,
8.150000000000003e-05,
7.933333333333338e-05,
7.716666666666668e-05,
7.500000000000003e-05,
7.283333333333339e-05,
7.066666666666669e-05,
6.850000000000004e-05,
6.633333333333339e-05,
6.416666666666669e-05,
6.200000000000004e-05,
5.9833333333333396e-05,
5.7666666666666695e-05,
5.550000000000005e-05,
5.333333333333335e-05,
5.11666666666667e-05,
4.900000000000005e-05,
4.683333333333335e-05,
4.4666666666666704e-05,
4.250000000000006e-05,
4.0333333333333356e-05,
3.816666666666671e-05,
3.600000000000006e-05,
3.383333333333336e-05,
3.1666666666666714e-05,
2.9500000000000067e-05,
2.7333333333333365e-05,
2.5166666666666718e-05,
2.3000000000000017e-05,
2.083333333333337e-05,
1.8666666666666723e-05,
1.650000000000002e-05,
1.4333333333333375e-05,
1.2166666666666727e-05,
1e-05,
]
# Logging: a single log file in the working directory, at verbosity 4, with
# batch size logging enabled.
log = ["./returnn.log"]
log_batch_size = True
log_verbosity = 4
# Upper bound on the number of sequences per batch.
# NOTE(review): presumably applied in addition to `batch_size` — confirm.
max_seqs = 128
# Equal to the final entry of `learning_rates` (1e-05).
min_learning_rate = 1e-05
# Path prefix used for the model checkpoints of this training job.
model = "/u/maximilian.kannen/setups/20230406_feat/work/i6_core/returnn/training/ReturnnTrainingJob.wQIA3Rc1nLak/output/models/epoch"
network = {
"specaug": {
"class": "eval",
"from": "features",
"eval": 'self.network.get_config().typed_value("transform")(source(0, as_data=True), max_time_num=1, max_time=15, max_feature_num=5, '
"max_feature=8, network=self.network)",
},
"conv_source": {
"class": "split_dims",
"from": ["specaug"],
"axis": "F",
"dims": (-1, 1),
},
"conv_1": {
"class": "conv",
"from": "conv_source",
"n_out": 32,
"filter_size": (3, 3),
"padding": "same",
"with_bias": True,
"activation": "swish",
"L2": 0.01,
},
"conv_1_pool": {
"class": "pool",
"mode": "max",
"padding": "same",
"pool_size": (1, 2),
"from": "conv_1",
"trainable": False,
},
"conv_merged": {"class": "merge_dims", "from": "conv_3", "axes": "static"},
"conv_2": {
"class": "conv",
"from": "conv_1_pool",
"n_out": 64,
"filter_size": (3, 3),
"padding": "same",
"with_bias": True,
"activation": "swish",
"L2": 0.01,
"strides": (2, 1),
},
"conv_3": {
"class": "conv",
"from": "conv_2",
"n_out": 64,
"filter_size": (3, 3),
"padding": "same",
"with_bias": True,
"activation": "swish",
"L2": 0.01,
"strides": (2, 1),
},
"input_linear": {
"class": "linear",
"n_out": 512,
"from": "conv_merged",
"activation": None,
"L2": 0.0001,
"with_bias": False,
},
"input_dropout": {"class": "copy", "from": "input_linear", "dropout": 0.1},
"conformer_1_ffmod_1_ln": {"class": "layer_norm", "from": "input_dropout"},
"conformer_1_ffmod_1_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_1_ffmod_1_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_1_ffmod_1_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_1_ffmod_1_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_1_ffmod_1_dropout": {
"class": "copy",
"from": "conformer_1_ffmod_1_dropout_linear",
"dropout": 0.1,
},
"conformer_1_ffmod_1_half_res_add": {
"class": "eval",
"from": ["conformer_1_ffmod_1_dropout", "input_dropout"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_1_conv_mod_ln": {
"class": "layer_norm",
"from": "conformer_1_ffmod_1_half_res_add",
},
"conformer_1_conv_mod_pointwise_conv_1": {
"class": "linear",
"n_out": 1024,
"from": "conformer_1_conv_mod_ln",
"activation": None,
"L2": 0.0001,
},
"conformer_1_conv_mod_glu": {
"class": "gating",
"from": "conformer_1_conv_mod_pointwise_conv_1",
"activation": None,
"gate_activation": "sigmoid",
},
"conformer_1_conv_mod_depthwise_conv": {
"class": "conv",
"from": "conformer_1_conv_mod_glu",
"n_out": 512,
"filter_size": (32,),
"padding": "same",
"with_bias": True,
"activation": None,
"L2": 0.0001,
"groups": 512,
},
"conformer_1_conv_mod_bn": {
"class": "batch_norm",
"from": "conformer_1_conv_mod_depthwise_conv",
"momentum": 0.1,
"epsilon": 1e-05,
"update_sample_only_in_training": True,
"delay_sample_update": True,
},
"conformer_1_conv_mod_swish": {
"class": "activation",
"from": "conformer_1_conv_mod_bn",
"activation": "swish",
},
"conformer_1_conv_mod_pointwise_conv_2": {
"class": "linear",
"n_out": 512,
"from": "conformer_1_conv_mod_swish",
"activation": None,
"L2": 0.0001,
},
"conformer_1_conv_mod_dropout": {
"class": "copy",
"from": "conformer_1_conv_mod_pointwise_conv_2",
"dropout": 0.1,
},
"conformer_1_conv_mod_res_add": {
"class": "combine",
"from": ["conformer_1_conv_mod_dropout", "conformer_1_ffmod_1_half_res_add"],
"kind": "add",
},
"conformer_1_mhsa_mod_ln": {
"class": "layer_norm",
"from": "conformer_1_conv_mod_res_add",
},
"conformer_1_mhsa_mod_relpos_encoding": {
"class": "relative_positional_encoding",
"from": "conformer_1_mhsa_mod_ln",
"n_out": 64,
"clipping": 32,
},
"conformer_1_mhsa_mod_self_attention": {
"class": "self_attention",
"from": "conformer_1_mhsa_mod_ln",
"n_out": 512,
"num_heads": 8,
"total_key_dim": 512,
"key_shift": "conformer_1_mhsa_mod_relpos_encoding",
"attention_dropout": 0.1,
},
"conformer_1_mhsa_mod_att_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_1_mhsa_mod_self_attention",
"activation": None,
"L2": 0.0001,
"with_bias": False,
},
"conformer_1_mhsa_mod_dropout": {
"class": "copy",
"from": "conformer_1_mhsa_mod_att_linear",
"dropout": 0.1,
},
"conformer_1_mhsa_mod_res_add": {
"class": "combine",
"from": ["conformer_1_mhsa_mod_dropout", "conformer_1_conv_mod_res_add"],
"kind": "add",
},
"conformer_1_ffmod_2_ln": {
"class": "layer_norm",
"from": "conformer_1_mhsa_mod_res_add",
},
"conformer_1_ffmod_2_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_1_ffmod_2_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_1_ffmod_2_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_1_ffmod_2_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_1_ffmod_2_dropout": {
"class": "copy",
"from": "conformer_1_ffmod_2_dropout_linear",
"dropout": 0.1,
},
"conformer_1_ffmod_2_half_res_add": {
"class": "eval",
"from": ["conformer_1_ffmod_2_dropout", "conformer_1_mhsa_mod_res_add"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_1_output": {
"class": "layer_norm",
"from": "conformer_1_ffmod_2_half_res_add",
},
"conformer_2_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_1_output"},
"conformer_2_ffmod_1_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_2_ffmod_1_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_2_ffmod_1_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_2_ffmod_1_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_2_ffmod_1_dropout": {
"class": "copy",
"from": "conformer_2_ffmod_1_dropout_linear",
"dropout": 0.1,
},
"conformer_2_ffmod_1_half_res_add": {
"class": "eval",
"from": ["conformer_2_ffmod_1_dropout", "conformer_1_output"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_2_conv_mod_ln": {
"class": "layer_norm",
"from": "conformer_2_ffmod_1_half_res_add",
},
"conformer_2_conv_mod_pointwise_conv_1": {
"class": "linear",
"n_out": 1024,
"from": "conformer_2_conv_mod_ln",
"activation": None,
"L2": 0.0001,
},
"conformer_2_conv_mod_glu": {
"class": "gating",
"from": "conformer_2_conv_mod_pointwise_conv_1",
"activation": None,
"gate_activation": "sigmoid",
},
"conformer_2_conv_mod_depthwise_conv": {
"class": "conv",
"from": "conformer_2_conv_mod_glu",
"n_out": 512,
"filter_size": (32,),
"padding": "same",
"with_bias": True,
"activation": None,
"L2": 0.0001,
"groups": 512,
},
"conformer_2_conv_mod_bn": {
"class": "batch_norm",
"from": "conformer_2_conv_mod_depthwise_conv",
"momentum": 0.1,
"epsilon": 1e-05,
"update_sample_only_in_training": True,
"delay_sample_update": True,
},
"conformer_2_conv_mod_swish": {
"class": "activation",
"from": "conformer_2_conv_mod_bn",
"activation": "swish",
},
"conformer_2_conv_mod_pointwise_conv_2": {
"class": "linear",
"n_out": 512,
"from": "conformer_2_conv_mod_swish",
"activation": None,
"L2": 0.0001,
},
"conformer_2_conv_mod_dropout": {
"class": "copy",
"from": "conformer_2_conv_mod_pointwise_conv_2",
"dropout": 0.1,
},
"conformer_2_conv_mod_res_add": {
"class": "combine",
"from": ["conformer_2_conv_mod_dropout", "conformer_2_ffmod_1_half_res_add"],
"kind": "add",
},
"conformer_2_mhsa_mod_ln": {
"class": "layer_norm",
"from": "conformer_2_conv_mod_res_add",
},
"conformer_2_mhsa_mod_relpos_encoding": {
"class": "relative_positional_encoding",
"from": "conformer_2_mhsa_mod_ln",
"n_out": 64,
"clipping": 32,
},
"conformer_2_mhsa_mod_self_attention": {
"class": "self_attention",
"from": "conformer_2_mhsa_mod_ln",
"n_out": 512,
"num_heads": 8,
"total_key_dim": 512,
"key_shift": "conformer_2_mhsa_mod_relpos_encoding",
"attention_dropout": 0.1,
},
"conformer_2_mhsa_mod_att_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_2_mhsa_mod_self_attention",
"activation": None,
"L2": 0.0001,
"with_bias": False,
},
"conformer_2_mhsa_mod_dropout": {
"class": "copy",
"from": "conformer_2_mhsa_mod_att_linear",
"dropout": 0.1,
},
"conformer_2_mhsa_mod_res_add": {
"class": "combine",
"from": ["conformer_2_mhsa_mod_dropout", "conformer_2_conv_mod_res_add"],
"kind": "add",
},
"conformer_2_ffmod_2_ln": {
"class": "layer_norm",
"from": "conformer_2_mhsa_mod_res_add",
},
"conformer_2_ffmod_2_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_2_ffmod_2_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_2_ffmod_2_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_2_ffmod_2_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_2_ffmod_2_dropout": {
"class": "copy",
"from": "conformer_2_ffmod_2_dropout_linear",
"dropout": 0.1,
},
"conformer_2_ffmod_2_half_res_add": {
"class": "eval",
"from": ["conformer_2_ffmod_2_dropout", "conformer_2_mhsa_mod_res_add"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_2_output": {
"class": "layer_norm",
"from": "conformer_2_ffmod_2_half_res_add",
},
"conformer_3_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_2_output"},
"conformer_3_ffmod_1_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_3_ffmod_1_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_3_ffmod_1_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_3_ffmod_1_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_3_ffmod_1_dropout": {
"class": "copy",
"from": "conformer_3_ffmod_1_dropout_linear",
"dropout": 0.1,
},
"conformer_3_ffmod_1_half_res_add": {
"class": "eval",
"from": ["conformer_3_ffmod_1_dropout", "conformer_2_output"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_3_conv_mod_ln": {
"class": "layer_norm",
"from": "conformer_3_ffmod_1_half_res_add",
},
"conformer_3_conv_mod_pointwise_conv_1": {
"class": "linear",
"n_out": 1024,
"from": "conformer_3_conv_mod_ln",
"activation": None,
"L2": 0.0001,
},
"conformer_3_conv_mod_glu": {
"class": "gating",
"from": "conformer_3_conv_mod_pointwise_conv_1",
"activation": None,
"gate_activation": "sigmoid",
},
"conformer_3_conv_mod_depthwise_conv": {
"class": "conv",
"from": "conformer_3_conv_mod_glu",
"n_out": 512,
"filter_size": (32,),
"padding": "same",
"with_bias": True,
"activation": None,
"L2": 0.0001,
"groups": 512,
},
"conformer_3_conv_mod_bn": {
"class": "batch_norm",
"from": "conformer_3_conv_mod_depthwise_conv",
"momentum": 0.1,
"epsilon": 1e-05,
"update_sample_only_in_training": True,
"delay_sample_update": True,
},
"conformer_3_conv_mod_swish": {
"class": "activation",
"from": "conformer_3_conv_mod_bn",
"activation": "swish",
},
"conformer_3_conv_mod_pointwise_conv_2": {
"class": "linear",
"n_out": 512,
"from": "conformer_3_conv_mod_swish",
"activation": None,
"L2": 0.0001,
},
"conformer_3_conv_mod_dropout": {
"class": "copy",
"from": "conformer_3_conv_mod_pointwise_conv_2",
"dropout": 0.1,
},
"conformer_3_conv_mod_res_add": {
"class": "combine",
"from": ["conformer_3_conv_mod_dropout", "conformer_3_ffmod_1_half_res_add"],
"kind": "add",
},
"conformer_3_mhsa_mod_ln": {
"class": "layer_norm",
"from": "conformer_3_conv_mod_res_add",
},
"conformer_3_mhsa_mod_relpos_encoding": {
"class": "relative_positional_encoding",
"from": "conformer_3_mhsa_mod_ln",
"n_out": 64,
"clipping": 32,
},
"conformer_3_mhsa_mod_self_attention": {
"class": "self_attention",
"from": "conformer_3_mhsa_mod_ln",
"n_out": 512,
"num_heads": 8,
"total_key_dim": 512,
"key_shift": "conformer_3_mhsa_mod_relpos_encoding",
"attention_dropout": 0.1,
},
"conformer_3_mhsa_mod_att_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_3_mhsa_mod_self_attention",
"activation": None,
"L2": 0.0001,
"with_bias": False,
},
"conformer_3_mhsa_mod_dropout": {
"class": "copy",
"from": "conformer_3_mhsa_mod_att_linear",
"dropout": 0.1,
},
"conformer_3_mhsa_mod_res_add": {
"class": "combine",
"from": ["conformer_3_mhsa_mod_dropout", "conformer_3_conv_mod_res_add"],
"kind": "add",
},
"conformer_3_ffmod_2_ln": {
"class": "layer_norm",
"from": "conformer_3_mhsa_mod_res_add",
},
"conformer_3_ffmod_2_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_3_ffmod_2_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_3_ffmod_2_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_3_ffmod_2_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_3_ffmod_2_dropout": {
"class": "copy",
"from": "conformer_3_ffmod_2_dropout_linear",
"dropout": 0.1,
},
"conformer_3_ffmod_2_half_res_add": {
"class": "eval",
"from": ["conformer_3_ffmod_2_dropout", "conformer_3_mhsa_mod_res_add"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_3_output": {
"class": "layer_norm",
"from": "conformer_3_ffmod_2_half_res_add",
},
"conformer_4_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_3_output"},
"conformer_4_ffmod_1_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_4_ffmod_1_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_4_ffmod_1_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_4_ffmod_1_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_4_ffmod_1_dropout": {
"class": "copy",
"from": "conformer_4_ffmod_1_dropout_linear",
"dropout": 0.1,
},
"conformer_4_ffmod_1_half_res_add": {
"class": "eval",
"from": ["conformer_4_ffmod_1_dropout", "conformer_3_output"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_4_conv_mod_ln": {
"class": "layer_norm",
"from": "conformer_4_ffmod_1_half_res_add",
},
"conformer_4_conv_mod_pointwise_conv_1": {
"class": "linear",
"n_out": 1024,
"from": "conformer_4_conv_mod_ln",
"activation": None,
"L2": 0.0001,
},
"conformer_4_conv_mod_glu": {
"class": "gating",
"from": "conformer_4_conv_mod_pointwise_conv_1",
"activation": None,
"gate_activation": "sigmoid",
},
"conformer_4_conv_mod_depthwise_conv": {
"class": "conv",
"from": "conformer_4_conv_mod_glu",
"n_out": 512,
"filter_size": (32,),
"padding": "same",
"with_bias": True,
"activation": None,
"L2": 0.0001,
"groups": 512,
},
"conformer_4_conv_mod_bn": {
"class": "batch_norm",
"from": "conformer_4_conv_mod_depthwise_conv",
"momentum": 0.1,
"epsilon": 1e-05,
"update_sample_only_in_training": True,
"delay_sample_update": True,
},
"conformer_4_conv_mod_swish": {
"class": "activation",
"from": "conformer_4_conv_mod_bn",
"activation": "swish",
},
"conformer_4_conv_mod_pointwise_conv_2": {
"class": "linear",
"n_out": 512,
"from": "conformer_4_conv_mod_swish",
"activation": None,
"L2": 0.0001,
},
"conformer_4_conv_mod_dropout": {
"class": "copy",
"from": "conformer_4_conv_mod_pointwise_conv_2",
"dropout": 0.1,
},
"conformer_4_conv_mod_res_add": {
"class": "combine",
"from": ["conformer_4_conv_mod_dropout", "conformer_4_ffmod_1_half_res_add"],
"kind": "add",
},
"conformer_4_mhsa_mod_ln": {
"class": "layer_norm",
"from": "conformer_4_conv_mod_res_add",
},
"conformer_4_mhsa_mod_relpos_encoding": {
"class": "relative_positional_encoding",
"from": "conformer_4_mhsa_mod_ln",
"n_out": 64,
"clipping": 32,
},
"conformer_4_mhsa_mod_self_attention": {
"class": "self_attention",
"from": "conformer_4_mhsa_mod_ln",
"n_out": 512,
"num_heads": 8,
"total_key_dim": 512,
"key_shift": "conformer_4_mhsa_mod_relpos_encoding",
"attention_dropout": 0.1,
},
"conformer_4_mhsa_mod_att_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_4_mhsa_mod_self_attention",
"activation": None,
"L2": 0.0001,
"with_bias": False,
},
"conformer_4_mhsa_mod_dropout": {
"class": "copy",
"from": "conformer_4_mhsa_mod_att_linear",
"dropout": 0.1,
},
"conformer_4_mhsa_mod_res_add": {
"class": "combine",
"from": ["conformer_4_mhsa_mod_dropout", "conformer_4_conv_mod_res_add"],
"kind": "add",
},
"conformer_4_ffmod_2_ln": {
"class": "layer_norm",
"from": "conformer_4_mhsa_mod_res_add",
},
"conformer_4_ffmod_2_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_4_ffmod_2_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_4_ffmod_2_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_4_ffmod_2_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_4_ffmod_2_dropout": {
"class": "copy",
"from": "conformer_4_ffmod_2_dropout_linear",
"dropout": 0.1,
},
"conformer_4_ffmod_2_half_res_add": {
"class": "eval",
"from": ["conformer_4_ffmod_2_dropout", "conformer_4_mhsa_mod_res_add"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_4_output": {
"class": "layer_norm",
"from": "conformer_4_ffmod_2_half_res_add",
},
"conformer_5_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_4_output"},
"conformer_5_ffmod_1_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_5_ffmod_1_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_5_ffmod_1_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_5_ffmod_1_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_5_ffmod_1_dropout": {
"class": "copy",
"from": "conformer_5_ffmod_1_dropout_linear",
"dropout": 0.1,
},
"conformer_5_ffmod_1_half_res_add": {
"class": "eval",
"from": ["conformer_5_ffmod_1_dropout", "conformer_4_output"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_5_conv_mod_ln": {
"class": "layer_norm",
"from": "conformer_5_ffmod_1_half_res_add",
},
"conformer_5_conv_mod_pointwise_conv_1": {
"class": "linear",
"n_out": 1024,
"from": "conformer_5_conv_mod_ln",
"activation": None,
"L2": 0.0001,
},
"conformer_5_conv_mod_glu": {
"class": "gating",
"from": "conformer_5_conv_mod_pointwise_conv_1",
"activation": None,
"gate_activation": "sigmoid",
},
"conformer_5_conv_mod_depthwise_conv": {
"class": "conv",
"from": "conformer_5_conv_mod_glu",
"n_out": 512,
"filter_size": (32,),
"padding": "same",
"with_bias": True,
"activation": None,
"L2": 0.0001,
"groups": 512,
},
"conformer_5_conv_mod_bn": {
"class": "batch_norm",
"from": "conformer_5_conv_mod_depthwise_conv",
"momentum": 0.1,
"epsilon": 1e-05,
"update_sample_only_in_training": True,
"delay_sample_update": True,
},
"conformer_5_conv_mod_swish": {
"class": "activation",
"from": "conformer_5_conv_mod_bn",
"activation": "swish",
},
"conformer_5_conv_mod_pointwise_conv_2": {
"class": "linear",
"n_out": 512,
"from": "conformer_5_conv_mod_swish",
"activation": None,
"L2": 0.0001,
},
"conformer_5_conv_mod_dropout": {
"class": "copy",
"from": "conformer_5_conv_mod_pointwise_conv_2",
"dropout": 0.1,
},
"conformer_5_conv_mod_res_add": {
"class": "combine",
"from": ["conformer_5_conv_mod_dropout", "conformer_5_ffmod_1_half_res_add"],
"kind": "add",
},
"conformer_5_mhsa_mod_ln": {
"class": "layer_norm",
"from": "conformer_5_conv_mod_res_add",
},
"conformer_5_mhsa_mod_relpos_encoding": {
"class": "relative_positional_encoding",
"from": "conformer_5_mhsa_mod_ln",
"n_out": 64,
"clipping": 32,
},
"conformer_5_mhsa_mod_self_attention": {
"class": "self_attention",
"from": "conformer_5_mhsa_mod_ln",
"n_out": 512,
"num_heads": 8,
"total_key_dim": 512,
"key_shift": "conformer_5_mhsa_mod_relpos_encoding",
"attention_dropout": 0.1,
},
"conformer_5_mhsa_mod_att_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_5_mhsa_mod_self_attention",
"activation": None,
"L2": 0.0001,
"with_bias": False,
},
"conformer_5_mhsa_mod_dropout": {
"class": "copy",
"from": "conformer_5_mhsa_mod_att_linear",
"dropout": 0.1,
},
"conformer_5_mhsa_mod_res_add": {
"class": "combine",
"from": ["conformer_5_mhsa_mod_dropout", "conformer_5_conv_mod_res_add"],
"kind": "add",
},
"conformer_5_ffmod_2_ln": {
"class": "layer_norm",
"from": "conformer_5_mhsa_mod_res_add",
},
"conformer_5_ffmod_2_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_5_ffmod_2_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_5_ffmod_2_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_5_ffmod_2_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_5_ffmod_2_dropout": {
"class": "copy",
"from": "conformer_5_ffmod_2_dropout_linear",
"dropout": 0.1,
},
"conformer_5_ffmod_2_half_res_add": {
"class": "eval",
"from": ["conformer_5_ffmod_2_dropout", "conformer_5_mhsa_mod_res_add"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_5_output": {
"class": "layer_norm",
"from": "conformer_5_ffmod_2_half_res_add",
},
"conformer_6_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_5_output"},
"conformer_6_ffmod_1_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_6_ffmod_1_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_6_ffmod_1_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_6_ffmod_1_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_6_ffmod_1_dropout": {
"class": "copy",
"from": "conformer_6_ffmod_1_dropout_linear",
"dropout": 0.1,
},
"conformer_6_ffmod_1_half_res_add": {
"class": "eval",
"from": ["conformer_6_ffmod_1_dropout", "conformer_5_output"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_6_conv_mod_ln": {
"class": "layer_norm",
"from": "conformer_6_ffmod_1_half_res_add",
},
"conformer_6_conv_mod_pointwise_conv_1": {
"class": "linear",
"n_out": 1024,
"from": "conformer_6_conv_mod_ln",
"activation": None,
"L2": 0.0001,
},
"conformer_6_conv_mod_glu": {
"class": "gating",
"from": "conformer_6_conv_mod_pointwise_conv_1",
"activation": None,
"gate_activation": "sigmoid",
},
"conformer_6_conv_mod_depthwise_conv": {
"class": "conv",
"from": "conformer_6_conv_mod_glu",
"n_out": 512,
"filter_size": (32,),
"padding": "same",
"with_bias": True,
"activation": None,
"L2": 0.0001,
"groups": 512,
},
"conformer_6_conv_mod_bn": {
"class": "batch_norm",
"from": "conformer_6_conv_mod_depthwise_conv",
"momentum": 0.1,
"epsilon": 1e-05,
"update_sample_only_in_training": True,
"delay_sample_update": True,
},
"conformer_6_conv_mod_swish": {
"class": "activation",
"from": "conformer_6_conv_mod_bn",
"activation": "swish",
},
"conformer_6_conv_mod_pointwise_conv_2": {
"class": "linear",
"n_out": 512,
"from": "conformer_6_conv_mod_swish",
"activation": None,
"L2": 0.0001,
},
"conformer_6_conv_mod_dropout": {
"class": "copy",
"from": "conformer_6_conv_mod_pointwise_conv_2",
"dropout": 0.1,
},
"conformer_6_conv_mod_res_add": {
"class": "combine",
"from": ["conformer_6_conv_mod_dropout", "conformer_6_ffmod_1_half_res_add"],
"kind": "add",
},
"conformer_6_mhsa_mod_ln": {
"class": "layer_norm",
"from": "conformer_6_conv_mod_res_add",
},
"conformer_6_mhsa_mod_relpos_encoding": {
"class": "relative_positional_encoding",
"from": "conformer_6_mhsa_mod_ln",
"n_out": 64,
"clipping": 32,
},
"conformer_6_mhsa_mod_self_attention": {
"class": "self_attention",
"from": "conformer_6_mhsa_mod_ln",
"n_out": 512,
"num_heads": 8,
"total_key_dim": 512,
"key_shift": "conformer_6_mhsa_mod_relpos_encoding",
"attention_dropout": 0.1,
},
"conformer_6_mhsa_mod_att_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_6_mhsa_mod_self_attention",
"activation": None,
"L2": 0.0001,
"with_bias": False,
},
"conformer_6_mhsa_mod_dropout": {
"class": "copy",
"from": "conformer_6_mhsa_mod_att_linear",
"dropout": 0.1,
},
"conformer_6_mhsa_mod_res_add": {
"class": "combine",
"from": ["conformer_6_mhsa_mod_dropout", "conformer_6_conv_mod_res_add"],
"kind": "add",
},
"conformer_6_ffmod_2_ln": {
"class": "layer_norm",
"from": "conformer_6_mhsa_mod_res_add",
},
"conformer_6_ffmod_2_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_6_ffmod_2_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_6_ffmod_2_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_6_ffmod_2_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_6_ffmod_2_dropout": {
"class": "copy",
"from": "conformer_6_ffmod_2_dropout_linear",
"dropout": 0.1,
},
"conformer_6_ffmod_2_half_res_add": {
"class": "eval",
"from": ["conformer_6_ffmod_2_dropout", "conformer_6_mhsa_mod_res_add"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_6_output": {
"class": "layer_norm",
"from": "conformer_6_ffmod_2_half_res_add",
},
"conformer_7_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_6_output"},
"conformer_7_ffmod_1_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_7_ffmod_1_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_7_ffmod_1_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_7_ffmod_1_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_7_ffmod_1_dropout": {
"class": "copy",
"from": "conformer_7_ffmod_1_dropout_linear",
"dropout": 0.1,
},
"conformer_7_ffmod_1_half_res_add": {
"class": "eval",
"from": ["conformer_7_ffmod_1_dropout", "conformer_6_output"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_7_conv_mod_ln": {
"class": "layer_norm",
"from": "conformer_7_ffmod_1_half_res_add",
},
"conformer_7_conv_mod_pointwise_conv_1": {
"class": "linear",
"n_out": 1024,
"from": "conformer_7_conv_mod_ln",
"activation": None,
"L2": 0.0001,
},
"conformer_7_conv_mod_glu": {
"class": "gating",
"from": "conformer_7_conv_mod_pointwise_conv_1",
"activation": None,
"gate_activation": "sigmoid",
},
"conformer_7_conv_mod_depthwise_conv": {
"class": "conv",
"from": "conformer_7_conv_mod_glu",
"n_out": 512,
"filter_size": (32,),
"padding": "same",
"with_bias": True,
"activation": None,
"L2": 0.0001,
"groups": 512,
},
"conformer_7_conv_mod_bn": {
"class": "batch_norm",
"from": "conformer_7_conv_mod_depthwise_conv",
"momentum": 0.1,
"epsilon": 1e-05,
"update_sample_only_in_training": True,
"delay_sample_update": True,
},
"conformer_7_conv_mod_swish": {
"class": "activation",
"from": "conformer_7_conv_mod_bn",
"activation": "swish",
},
"conformer_7_conv_mod_pointwise_conv_2": {
"class": "linear",
"n_out": 512,
"from": "conformer_7_conv_mod_swish",
"activation": None,
"L2": 0.0001,
},
"conformer_7_conv_mod_dropout": {
"class": "copy",
"from": "conformer_7_conv_mod_pointwise_conv_2",
"dropout": 0.1,
},
"conformer_7_conv_mod_res_add": {
"class": "combine",
"from": ["conformer_7_conv_mod_dropout", "conformer_7_ffmod_1_half_res_add"],
"kind": "add",
},
"conformer_7_mhsa_mod_ln": {
"class": "layer_norm",
"from": "conformer_7_conv_mod_res_add",
},
"conformer_7_mhsa_mod_relpos_encoding": {
"class": "relative_positional_encoding",
"from": "conformer_7_mhsa_mod_ln",
"n_out": 64,
"clipping": 32,
},
"conformer_7_mhsa_mod_self_attention": {
"class": "self_attention",
"from": "conformer_7_mhsa_mod_ln",
"n_out": 512,
"num_heads": 8,
"total_key_dim": 512,
"key_shift": "conformer_7_mhsa_mod_relpos_encoding",
"attention_dropout": 0.1,
},
"conformer_7_mhsa_mod_att_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_7_mhsa_mod_self_attention",
"activation": None,
"L2": 0.0001,
"with_bias": False,
},
"conformer_7_mhsa_mod_dropout": {
"class": "copy",
"from": "conformer_7_mhsa_mod_att_linear",
"dropout": 0.1,
},
"conformer_7_mhsa_mod_res_add": {
"class": "combine",
"from": ["conformer_7_mhsa_mod_dropout", "conformer_7_conv_mod_res_add"],
"kind": "add",
},
"conformer_7_ffmod_2_ln": {
"class": "layer_norm",
"from": "conformer_7_mhsa_mod_res_add",
},
"conformer_7_ffmod_2_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_7_ffmod_2_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_7_ffmod_2_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_7_ffmod_2_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_7_ffmod_2_dropout": {
"class": "copy",
"from": "conformer_7_ffmod_2_dropout_linear",
"dropout": 0.1,
},
"conformer_7_ffmod_2_half_res_add": {
"class": "eval",
"from": ["conformer_7_ffmod_2_dropout", "conformer_7_mhsa_mod_res_add"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_7_output": {
"class": "layer_norm",
"from": "conformer_7_ffmod_2_half_res_add",
},
"conformer_8_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_7_output"},
"conformer_8_ffmod_1_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_8_ffmod_1_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_8_ffmod_1_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_8_ffmod_1_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_8_ffmod_1_dropout": {
"class": "copy",
"from": "conformer_8_ffmod_1_dropout_linear",
"dropout": 0.1,
},
"conformer_8_ffmod_1_half_res_add": {
"class": "eval",
"from": ["conformer_8_ffmod_1_dropout", "conformer_7_output"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_8_conv_mod_ln": {
"class": "layer_norm",
"from": "conformer_8_ffmod_1_half_res_add",
},
"conformer_8_conv_mod_pointwise_conv_1": {
"class": "linear",
"n_out": 1024,
"from": "conformer_8_conv_mod_ln",
"activation": None,
"L2": 0.0001,
},
"conformer_8_conv_mod_glu": {
"class": "gating",
"from": "conformer_8_conv_mod_pointwise_conv_1",
"activation": None,
"gate_activation": "sigmoid",
},
"conformer_8_conv_mod_depthwise_conv": {
"class": "conv",
"from": "conformer_8_conv_mod_glu",
"n_out": 512,
"filter_size": (32,),
"padding": "same",
"with_bias": True,
"activation": None,
"L2": 0.0001,
"groups": 512,
},
"conformer_8_conv_mod_bn": {
"class": "batch_norm",
"from": "conformer_8_conv_mod_depthwise_conv",
"momentum": 0.1,
"epsilon": 1e-05,
"update_sample_only_in_training": True,
"delay_sample_update": True,
},
"conformer_8_conv_mod_swish": {
"class": "activation",
"from": "conformer_8_conv_mod_bn",
"activation": "swish",
},
"conformer_8_conv_mod_pointwise_conv_2": {
"class": "linear",
"n_out": 512,
"from": "conformer_8_conv_mod_swish",
"activation": None,
"L2": 0.0001,
},
"conformer_8_conv_mod_dropout": {
"class": "copy",
"from": "conformer_8_conv_mod_pointwise_conv_2",
"dropout": 0.1,
},
"conformer_8_conv_mod_res_add": {
"class": "combine",
"from": ["conformer_8_conv_mod_dropout", "conformer_8_ffmod_1_half_res_add"],
"kind": "add",
},
"conformer_8_mhsa_mod_ln": {
"class": "layer_norm",
"from": "conformer_8_conv_mod_res_add",
},
"conformer_8_mhsa_mod_relpos_encoding": {
"class": "relative_positional_encoding",
"from": "conformer_8_mhsa_mod_ln",
"n_out": 64,
"clipping": 32,
},
"conformer_8_mhsa_mod_self_attention": {
"class": "self_attention",
"from": "conformer_8_mhsa_mod_ln",
"n_out": 512,
"num_heads": 8,
"total_key_dim": 512,
"key_shift": "conformer_8_mhsa_mod_relpos_encoding",
"attention_dropout": 0.1,
},
"conformer_8_mhsa_mod_att_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_8_mhsa_mod_self_attention",
"activation": None,
"L2": 0.0001,
"with_bias": False,
},
"conformer_8_mhsa_mod_dropout": {
"class": "copy",
"from": "conformer_8_mhsa_mod_att_linear",
"dropout": 0.1,
},
"conformer_8_mhsa_mod_res_add": {
"class": "combine",
"from": ["conformer_8_mhsa_mod_dropout", "conformer_8_conv_mod_res_add"],
"kind": "add",
},
"conformer_8_ffmod_2_ln": {
"class": "layer_norm",
"from": "conformer_8_mhsa_mod_res_add",
},
"conformer_8_ffmod_2_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_8_ffmod_2_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_8_ffmod_2_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_8_ffmod_2_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_8_ffmod_2_dropout": {
"class": "copy",
"from": "conformer_8_ffmod_2_dropout_linear",
"dropout": 0.1,
},
"conformer_8_ffmod_2_half_res_add": {
"class": "eval",
"from": ["conformer_8_ffmod_2_dropout", "conformer_8_mhsa_mod_res_add"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_8_output": {
"class": "layer_norm",
"from": "conformer_8_ffmod_2_half_res_add",
},
"conformer_9_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_8_output"},
"conformer_9_ffmod_1_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_9_ffmod_1_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_9_ffmod_1_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_9_ffmod_1_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_9_ffmod_1_dropout": {
"class": "copy",
"from": "conformer_9_ffmod_1_dropout_linear",
"dropout": 0.1,
},
"conformer_9_ffmod_1_half_res_add": {
"class": "eval",
"from": ["conformer_9_ffmod_1_dropout", "conformer_8_output"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_9_conv_mod_ln": {
"class": "layer_norm",
"from": "conformer_9_ffmod_1_half_res_add",
},
"conformer_9_conv_mod_pointwise_conv_1": {
"class": "linear",
"n_out": 1024,
"from": "conformer_9_conv_mod_ln",
"activation": None,
"L2": 0.0001,
},
"conformer_9_conv_mod_glu": {
"class": "gating",
"from": "conformer_9_conv_mod_pointwise_conv_1",
"activation": None,
"gate_activation": "sigmoid",
},
"conformer_9_conv_mod_depthwise_conv": {
"class": "conv",
"from": "conformer_9_conv_mod_glu",
"n_out": 512,
"filter_size": (32,),
"padding": "same",
"with_bias": True,
"activation": None,
"L2": 0.0001,
"groups": 512,
},
"conformer_9_conv_mod_bn": {
"class": "batch_norm",
"from": "conformer_9_conv_mod_depthwise_conv",
"momentum": 0.1,
"epsilon": 1e-05,
"update_sample_only_in_training": True,
"delay_sample_update": True,
},
"conformer_9_conv_mod_swish": {
"class": "activation",
"from": "conformer_9_conv_mod_bn",
"activation": "swish",
},
"conformer_9_conv_mod_pointwise_conv_2": {
"class": "linear",
"n_out": 512,
"from": "conformer_9_conv_mod_swish",
"activation": None,
"L2": 0.0001,
},
"conformer_9_conv_mod_dropout": {
"class": "copy",
"from": "conformer_9_conv_mod_pointwise_conv_2",
"dropout": 0.1,
},
"conformer_9_conv_mod_res_add": {
"class": "combine",
"from": ["conformer_9_conv_mod_dropout", "conformer_9_ffmod_1_half_res_add"],
"kind": "add",
},
"conformer_9_mhsa_mod_ln": {
"class": "layer_norm",
"from": "conformer_9_conv_mod_res_add",
},
"conformer_9_mhsa_mod_relpos_encoding": {
"class": "relative_positional_encoding",
"from": "conformer_9_mhsa_mod_ln",
"n_out": 64,
"clipping": 32,
},
"conformer_9_mhsa_mod_self_attention": {
"class": "self_attention",
"from": "conformer_9_mhsa_mod_ln",
"n_out": 512,
"num_heads": 8,
"total_key_dim": 512,
"key_shift": "conformer_9_mhsa_mod_relpos_encoding",
"attention_dropout": 0.1,
},
"conformer_9_mhsa_mod_att_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_9_mhsa_mod_self_attention",
"activation": None,
"L2": 0.0001,
"with_bias": False,
},
"conformer_9_mhsa_mod_dropout": {
"class": "copy",
"from": "conformer_9_mhsa_mod_att_linear",
"dropout": 0.1,
},
"conformer_9_mhsa_mod_res_add": {
"class": "combine",
"from": ["conformer_9_mhsa_mod_dropout", "conformer_9_conv_mod_res_add"],
"kind": "add",
},
"conformer_9_ffmod_2_ln": {
"class": "layer_norm",
"from": "conformer_9_mhsa_mod_res_add",
},
"conformer_9_ffmod_2_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_9_ffmod_2_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_9_ffmod_2_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_9_ffmod_2_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_9_ffmod_2_dropout": {
"class": "copy",
"from": "conformer_9_ffmod_2_dropout_linear",
"dropout": 0.1,
},
"conformer_9_ffmod_2_half_res_add": {
"class": "eval",
"from": ["conformer_9_ffmod_2_dropout", "conformer_9_mhsa_mod_res_add"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_9_output": {
"class": "layer_norm",
"from": "conformer_9_ffmod_2_half_res_add",
},
"conformer_10_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_9_output"},
"conformer_10_ffmod_1_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_10_ffmod_1_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_10_ffmod_1_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_10_ffmod_1_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_10_ffmod_1_dropout": {
"class": "copy",
"from": "conformer_10_ffmod_1_dropout_linear",
"dropout": 0.1,
},
"conformer_10_ffmod_1_half_res_add": {
"class": "eval",
"from": ["conformer_10_ffmod_1_dropout", "conformer_9_output"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_10_conv_mod_ln": {
"class": "layer_norm",
"from": "conformer_10_ffmod_1_half_res_add",
},
"conformer_10_conv_mod_pointwise_conv_1": {
"class": "linear",
"n_out": 1024,
"from": "conformer_10_conv_mod_ln",
"activation": None,
"L2": 0.0001,
},
"conformer_10_conv_mod_glu": {
"class": "gating",
"from": "conformer_10_conv_mod_pointwise_conv_1",
"activation": None,
"gate_activation": "sigmoid",
},
"conformer_10_conv_mod_depthwise_conv": {
"class": "conv",
"from": "conformer_10_conv_mod_glu",
"n_out": 512,
"filter_size": (32,),
"padding": "same",
"with_bias": True,
"activation": None,
"L2": 0.0001,
"groups": 512,
},
"conformer_10_conv_mod_bn": {
"class": "batch_norm",
"from": "conformer_10_conv_mod_depthwise_conv",
"momentum": 0.1,
"epsilon": 1e-05,
"update_sample_only_in_training": True,
"delay_sample_update": True,
},
"conformer_10_conv_mod_swish": {
"class": "activation",
"from": "conformer_10_conv_mod_bn",
"activation": "swish",
},
"conformer_10_conv_mod_pointwise_conv_2": {
"class": "linear",
"n_out": 512,
"from": "conformer_10_conv_mod_swish",
"activation": None,
"L2": 0.0001,
},
"conformer_10_conv_mod_dropout": {
"class": "copy",
"from": "conformer_10_conv_mod_pointwise_conv_2",
"dropout": 0.1,
},
"conformer_10_conv_mod_res_add": {
"class": "combine",
"from": ["conformer_10_conv_mod_dropout", "conformer_10_ffmod_1_half_res_add"],
"kind": "add",
},
"conformer_10_mhsa_mod_ln": {
"class": "layer_norm",
"from": "conformer_10_conv_mod_res_add",
},
"conformer_10_mhsa_mod_relpos_encoding": {
"class": "relative_positional_encoding",
"from": "conformer_10_mhsa_mod_ln",
"n_out": 64,
"clipping": 32,
},
"conformer_10_mhsa_mod_self_attention": {
"class": "self_attention",
"from": "conformer_10_mhsa_mod_ln",
"n_out": 512,
"num_heads": 8,
"total_key_dim": 512,
"key_shift": "conformer_10_mhsa_mod_relpos_encoding",
"attention_dropout": 0.1,
},
"conformer_10_mhsa_mod_att_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_10_mhsa_mod_self_attention",
"activation": None,
"L2": 0.0001,
"with_bias": False,
},
"conformer_10_mhsa_mod_dropout": {
"class": "copy",
"from": "conformer_10_mhsa_mod_att_linear",
"dropout": 0.1,
},
"conformer_10_mhsa_mod_res_add": {
"class": "combine",
"from": ["conformer_10_mhsa_mod_dropout", "conformer_10_conv_mod_res_add"],
"kind": "add",
},
"conformer_10_ffmod_2_ln": {
"class": "layer_norm",
"from": "conformer_10_mhsa_mod_res_add",
},
"conformer_10_ffmod_2_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_10_ffmod_2_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_10_ffmod_2_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_10_ffmod_2_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_10_ffmod_2_dropout": {
"class": "copy",
"from": "conformer_10_ffmod_2_dropout_linear",
"dropout": 0.1,
},
"conformer_10_ffmod_2_half_res_add": {
"class": "eval",
"from": ["conformer_10_ffmod_2_dropout", "conformer_10_mhsa_mod_res_add"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_10_output": {
"class": "layer_norm",
"from": "conformer_10_ffmod_2_half_res_add",
},
"conformer_11_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_10_output"},
"conformer_11_ffmod_1_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_11_ffmod_1_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_11_ffmod_1_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_11_ffmod_1_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_11_ffmod_1_dropout": {
"class": "copy",
"from": "conformer_11_ffmod_1_dropout_linear",
"dropout": 0.1,
},
"conformer_11_ffmod_1_half_res_add": {
"class": "eval",
"from": ["conformer_11_ffmod_1_dropout", "conformer_10_output"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_11_conv_mod_ln": {
"class": "layer_norm",
"from": "conformer_11_ffmod_1_half_res_add",
},
"conformer_11_conv_mod_pointwise_conv_1": {
"class": "linear",
"n_out": 1024,
"from": "conformer_11_conv_mod_ln",
"activation": None,
"L2": 0.0001,
},
"conformer_11_conv_mod_glu": {
"class": "gating",
"from": "conformer_11_conv_mod_pointwise_conv_1",
"activation": None,
"gate_activation": "sigmoid",
},
"conformer_11_conv_mod_depthwise_conv": {
"class": "conv",
"from": "conformer_11_conv_mod_glu",
"n_out": 512,
"filter_size": (32,),
"padding": "same",
"with_bias": True,
"activation": None,
"L2": 0.0001,
"groups": 512,
},
"conformer_11_conv_mod_bn": {
"class": "batch_norm",
"from": "conformer_11_conv_mod_depthwise_conv",
"momentum": 0.1,
"epsilon": 1e-05,
"update_sample_only_in_training": True,
"delay_sample_update": True,
},
"conformer_11_conv_mod_swish": {
"class": "activation",
"from": "conformer_11_conv_mod_bn",
"activation": "swish",
},
"conformer_11_conv_mod_pointwise_conv_2": {
"class": "linear",
"n_out": 512,
"from": "conformer_11_conv_mod_swish",
"activation": None,
"L2": 0.0001,
},
"conformer_11_conv_mod_dropout": {
"class": "copy",
"from": "conformer_11_conv_mod_pointwise_conv_2",
"dropout": 0.1,
},
"conformer_11_conv_mod_res_add": {
"class": "combine",
"from": ["conformer_11_conv_mod_dropout", "conformer_11_ffmod_1_half_res_add"],
"kind": "add",
},
"conformer_11_mhsa_mod_ln": {
"class": "layer_norm",
"from": "conformer_11_conv_mod_res_add",
},
"conformer_11_mhsa_mod_relpos_encoding": {
"class": "relative_positional_encoding",
"from": "conformer_11_mhsa_mod_ln",
"n_out": 64,
"clipping": 32,
},
"conformer_11_mhsa_mod_self_attention": {
"class": "self_attention",
"from": "conformer_11_mhsa_mod_ln",
"n_out": 512,
"num_heads": 8,
"total_key_dim": 512,
"key_shift": "conformer_11_mhsa_mod_relpos_encoding",
"attention_dropout": 0.1,
},
"conformer_11_mhsa_mod_att_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_11_mhsa_mod_self_attention",
"activation": None,
"L2": 0.0001,
"with_bias": False,
},
"conformer_11_mhsa_mod_dropout": {
"class": "copy",
"from": "conformer_11_mhsa_mod_att_linear",
"dropout": 0.1,
},
"conformer_11_mhsa_mod_res_add": {
"class": "combine",
"from": ["conformer_11_mhsa_mod_dropout", "conformer_11_conv_mod_res_add"],
"kind": "add",
},
"conformer_11_ffmod_2_ln": {
"class": "layer_norm",
"from": "conformer_11_mhsa_mod_res_add",
},
"conformer_11_ffmod_2_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_11_ffmod_2_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_11_ffmod_2_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_11_ffmod_2_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_11_ffmod_2_dropout": {
"class": "copy",
"from": "conformer_11_ffmod_2_dropout_linear",
"dropout": 0.1,
},
"conformer_11_ffmod_2_half_res_add": {
"class": "eval",
"from": ["conformer_11_ffmod_2_dropout", "conformer_11_mhsa_mod_res_add"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_11_output": {
"class": "layer_norm",
"from": "conformer_11_ffmod_2_half_res_add",
},
"conformer_12_ffmod_1_ln": {"class": "layer_norm", "from": "conformer_11_output"},
"conformer_12_ffmod_1_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_12_ffmod_1_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_12_ffmod_1_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_12_ffmod_1_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_12_ffmod_1_dropout": {
"class": "copy",
"from": "conformer_12_ffmod_1_dropout_linear",
"dropout": 0.1,
},
"conformer_12_ffmod_1_half_res_add": {
"class": "eval",
"from": ["conformer_12_ffmod_1_dropout", "conformer_11_output"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_12_conv_mod_ln": {
"class": "layer_norm",
"from": "conformer_12_ffmod_1_half_res_add",
},
"conformer_12_conv_mod_pointwise_conv_1": {
"class": "linear",
"n_out": 1024,
"from": "conformer_12_conv_mod_ln",
"activation": None,
"L2": 0.0001,
},
"conformer_12_conv_mod_glu": {
"class": "gating",
"from": "conformer_12_conv_mod_pointwise_conv_1",
"activation": None,
"gate_activation": "sigmoid",
},
"conformer_12_conv_mod_depthwise_conv": {
"class": "conv",
"from": "conformer_12_conv_mod_glu",
"n_out": 512,
"filter_size": (32,),
"padding": "same",
"with_bias": True,
"activation": None,
"L2": 0.0001,
"groups": 512,
},
"conformer_12_conv_mod_bn": {
"class": "batch_norm",
"from": "conformer_12_conv_mod_depthwise_conv",
"momentum": 0.1,
"epsilon": 1e-05,
"update_sample_only_in_training": True,
"delay_sample_update": True,
},
"conformer_12_conv_mod_swish": {
"class": "activation",
"from": "conformer_12_conv_mod_bn",
"activation": "swish",
},
"conformer_12_conv_mod_pointwise_conv_2": {
"class": "linear",
"n_out": 512,
"from": "conformer_12_conv_mod_swish",
"activation": None,
"L2": 0.0001,
},
"conformer_12_conv_mod_dropout": {
"class": "copy",
"from": "conformer_12_conv_mod_pointwise_conv_2",
"dropout": 0.1,
},
"conformer_12_conv_mod_res_add": {
"class": "combine",
"from": ["conformer_12_conv_mod_dropout", "conformer_12_ffmod_1_half_res_add"],
"kind": "add",
},
"conformer_12_mhsa_mod_ln": {
"class": "layer_norm",
"from": "conformer_12_conv_mod_res_add",
},
"conformer_12_mhsa_mod_relpos_encoding": {
"class": "relative_positional_encoding",
"from": "conformer_12_mhsa_mod_ln",
"n_out": 64,
"clipping": 32,
},
"conformer_12_mhsa_mod_self_attention": {
"class": "self_attention",
"from": "conformer_12_mhsa_mod_ln",
"n_out": 512,
"num_heads": 8,
"total_key_dim": 512,
"key_shift": "conformer_12_mhsa_mod_relpos_encoding",
"attention_dropout": 0.1,
},
"conformer_12_mhsa_mod_att_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_12_mhsa_mod_self_attention",
"activation": None,
"L2": 0.0001,
"with_bias": False,
},
"conformer_12_mhsa_mod_dropout": {
"class": "copy",
"from": "conformer_12_mhsa_mod_att_linear",
"dropout": 0.1,
},
"conformer_12_mhsa_mod_res_add": {
"class": "combine",
"from": ["conformer_12_mhsa_mod_dropout", "conformer_12_conv_mod_res_add"],
"kind": "add",
},
"conformer_12_ffmod_2_ln": {
"class": "layer_norm",
"from": "conformer_12_mhsa_mod_res_add",
},
"conformer_12_ffmod_2_linear_swish": {
"class": "linear",
"n_out": 2048,
"from": "conformer_12_ffmod_2_ln",
"activation": "swish",
"L2": 0.0001,
},
"conformer_12_ffmod_2_dropout_linear": {
"class": "linear",
"n_out": 512,
"from": "conformer_12_ffmod_2_linear_swish",
"activation": None,
"L2": 0.0001,
"dropout": 0.1,
},
"conformer_12_ffmod_2_dropout": {
"class": "copy",
"from": "conformer_12_ffmod_2_dropout_linear",
"dropout": 0.1,
},
"conformer_12_ffmod_2_half_res_add": {
"class": "eval",
"from": ["conformer_12_ffmod_2_dropout", "conformer_12_mhsa_mod_res_add"],
"eval": "0.5 * source(0) + source(1)",
},
"conformer_12_output": {
"class": "layer_norm",
"from": "conformer_12_ffmod_2_half_res_add",
},
"encoder": {"class": "copy", "from": "conformer_12_output"},
"output": {
"class": "softmax",
"from": "encoder",
"loss": "fast_bw",
"loss_opts": {
"sprint_opts": {
"sprintExecPath": "/work/asr4/vieting/programs/rasr/20230707/rasr/arch/linux-x86_64-standard/nn-trainer.linux-x86_64-standard",
"sprintConfigStr": "--*.configuration.channel=output-channel "
"--*.real-time-factor.channel=output-channel --*.system-info.channel=output-channel "
"--*.time.channel=output-channel --*.version.channel=output-channel "
"--*.log.channel=output-channel --*.warning.channel=output-channel, stderr "
"--*.error.channel=output-channel, stderr --*.statistics.channel=output-channel "
"--*.progress.channel=output-channel --*.dot.channel=nil "
"--*.corpus.file=/u/maximilian.kannen/setups/20230406_feat/work/i6_core/datasets/switchboard/CreateSwitchboardBlissCorpusJob.Z1EMi4TdrUS6/output/swb.corpus.xml.gz "
"--*.corpus.segments.file=/u/maximilian.kannen/setups/20230406_feat/work/i6_core/corpus/filter/FilterSegmentsByListJob.nrKcBIdsMBZm/output/segments.1 "
"--*.model-combination.lexicon.file=/u/maximilian.kannen/setups/20230406_feat/work/i6_experiments/users/berger/recipe/lexicon/modification/MakeBlankLexiconJob.N8RlHYKzilei/output/lexicon.xml "
"--*.model-combination.acoustic-model.state-tying.type=lookup "
"--*.model-combination.acoustic-model.state-tying.file=/u/vieting/setups/swb/20230406_feat/dependencies/state-tying_blank "
"--*.model-combination.acoustic-model.allophones.add-from-lexicon=no "
"--*.model-combination.acoustic-model.allophones.add-all=yes "
"--*.model-combination.acoustic-model.allophones.add-from-file=/u/vieting/setups/swb/20230406_feat/dependencies/allophones_blank "
"--*.model-combination.acoustic-model.hmm.states-per-phone=1 "
"--*.model-combination.acoustic-model.hmm.state-repetitions=1 "
"--*.model-combination.acoustic-model.hmm.across-word-model=yes "
"--*.model-combination.acoustic-model.hmm.early-recombination=no "
"--*.model-combination.acoustic-model.tdp.scale=1.0 "
"--*.model-combination.acoustic-model.tdp.*.loop=0.0 "
"--*.model-combination.acoustic-model.tdp.*.forward=0.0 "
"--*.model-combination.acoustic-model.tdp.*.skip=infinity "
"--*.model-combination.acoustic-model.tdp.*.exit=0.0 "
"--*.model-combination.acoustic-model.tdp.silence.loop=0.0 "
"--*.model-combination.acoustic-model.tdp.silence.forward=0.0 "
"--*.model-combination.acoustic-model.tdp.silence.skip=infinity "
"--*.model-combination.acoustic-model.tdp.silence.exit=0.0 "
"--*.model-combination.acoustic-model.tdp.entry-m1.loop=infinity "
"--*.model-combination.acoustic-model.tdp.entry-m2.loop=infinity "
"--*.model-combination.acoustic-model.phonology.history-length=0 "
"--*.model-combination.acoustic-model.phonology.future-length=0 "
"--*.transducer-builder-filter-out-invalid-allophones=yes "
"--*.fix-allophone-context-at-word-boundaries=yes "
"--*.allophone-state-graph-builder.topology=ctc "
"--*.allow-for-silence-repetitions=no --action=python-control "
"--python-control-loop-type=python-control-loop --extract-features=no "
"--*.encoding=UTF-8 --*.output-channel.file=$(LOGFILE) "
"--*.output-channel.compressed=no --*.output-channel.append=no "
"--*.output-channel.unbuffered=no --*.LOGFILE=nn-trainer.loss.log --*.TASK=1",
"minPythonControlVersion": 4,
"numInstances": 2,
"usePythonSegmentOrder": False,
},
"tdp_scale": 0.0,
},
"target": None,
"n_out": 88,
},
"features": {
"class": "subnetwork",
"from": ["data"],
"subnetwork": {
"conv_h_filter": {
"class": "variable",
"shape": (128, 1, 150),
"init": "glorot_uniform",
},
"conv_h": {
"class": "conv",
"filter_size": (128,),
"strides": 5,
"n_out": 150,
"padding": "valid",
"filter": "conv_h_filter",
"from": "data",
},
"conv_h_split": {
"class": "split_dims",
"axis": "F",
"dims": (-1, 1),
"from": "conv_h_act",
},
"conv_l": {
"class": "conv",
"filter_size": (40, 1),
"strides": (16, 1),
"n_out": 5,
"padding": "valid",
"from": "conv_h_split",
},
"conv_l_merge": {
"class": "merge_dims",
"axes": "except_time",
"from": "conv_l",
},
"output": {"class": "copy", "from": "conv_l_act"},
"conv_h_act": {
"class": "eval",
"eval": "tf.abs(source(0))",
"from": "conv_h",
},
"conv_l_act": {"class": "layer_norm", "from": ["conv_l_act_no_norm"]},
"conv_l_act_no_norm": {
"class": "eval",
"eval": "tf.pow(tf.abs(source(0)) + 1e-05, 1 / 2.5)",
"from": ["conv_l_merge"],
},
},
"trainable": True,
},
}
# --- Run mode and target ------------------------------------------------
task = "train"
target = "classes"

# --- Training length and checkpointing ----------------------------------
# NOTE(review): presumably `save_interval` counts epochs between saved
# checkpoints; confirm against the RETURNN settings documentation.
num_epochs = 450
save_interval = 1

# --- Optimizer ----------------------------------------------------------
optimizer = {
    "class": "nadam",
    "epsilon": 1e-8,
}

# --- "newbob" learning-rate control -------------------------------------
# The exact meaning of these three values is defined by RETURNN's
# learning-rate control, not by this file; only the values are set here.
newbob_learning_rate_decay = 0.9
newbob_multi_num_epochs = 6
newbob_multi_update_interval = 1

# --- Diagnostics --------------------------------------------------------
tf_log_memory_usage = True
# Training data: a "MultiProcDataset" wrapper configured around a single
# "OggZipDataset" that is restricted by a segment list file.
train = {
    "class": "MultiProcDataset",
    "dataset": {
        "class": "OggZipDataset",
        # Audio is requested as "raw" features with peak normalization on.
        "audio": {"features": "raw", "peak_normalization": True},
        "partition_epoch": 6,
        "path": [
            "/u/maximilian.kannen/setups/20230406_feat/work/i6_core/returnn/oggzip/BlissToOggZipJob.lAFM8R9mzLpI/output/out.ogg.zip",
        ],
        "seq_ordering": "laplace:.384",
        "use_cache_manager": True,
        "segment_file": (
            "/u/maximilian.kannen/setups/20230406_feat/work/i6_core/corpus/filter/FilterSegmentsByListJob.SVlbt6fqP4Jn/output/segments.1"
        ),
        # No targets are configured on the dataset itself.
        "targets": None,
    },
    # Options of the outer wrapper (not of the wrapped dataset).
    "num_workers": 2,
    "buffer_size": 5,
}
# Backend and monitoring switches.
use_tensorflow = True
watch_memory = True

# Late overrides: every entry of `config` is written into this module's
# namespace, so it must stay after all other settings. The dict is empty
# here, so nothing is overridden.
config = {}
locals().update(config)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment