RETURNN (CRNN) configuration file for end-to-end ASR on LibriSpeech with local attention and a unidirectional LSTM encoder.
#!crnn/rnn.py
# kate: syntax python;
import os
import numpy
from subprocess import check_output, CalledProcessError
from Pretrain import WrapEpochValue

# task
use_tensorflow = True
task = "train"
device = "gpu"
multiprocessing = True
update_on_device = True
setup_name = os.path.splitext(os.path.basename(__file__))[0]
debug_mode = False
if int(os.environ.get("DEBUG", "0")):
    print("** DEBUG MODE")
    debug_mode = True

# `config` is injected as a global by RETURNN when it loads this file.
if config.has("beam_size"):
    beam_size = config.int("beam_size", 0)
    print("** beam_size %i" % beam_size)
else:
    beam_size = 12

# data
num_inputs = 40
num_outputs = {"classes": (10025, 1), "data": (num_inputs, 2)}  # see vocab
EpochSplit = 20

def get_dataset(key, subset=None, train_partition_epoch=None):
    d = {
        'class': 'LibriSpeechCorpus',
        'path': 'data/dataset',
        "use_zip": True,
        "use_cache_manager": True,
        "prefix": key,
        "bpe": {
            'bpe_file': 'data/dataset/trans.bpe.codes',
            'vocab_file': 'data/dataset/trans.bpe.vocab',
            'seq_postfix': [0],
            'unknown_label': '<unk>'},
        "audio": {
            "norm_mean": "data/dataset/stats.mean.txt",
            "norm_std_dev": "data/dataset/stats.std_dev.txt"},
    }
    if key.startswith("train"):
        d["partition_epoch"] = train_partition_epoch
        if key == "train":
            d["epoch_wise_filter"] = {
                (1, 5): {
                    'max_mean_len': 75,  # chars, should be around 14 BPE labels
                    'subdirs': ['train-clean-100', 'train-clean-360']}}
            #d["audio"]["random_permute"] = True
            num_seqs = 281241  # total
            d["seq_ordering"] = "laplace:%i" % (num_seqs // 1000)
    else:
        d["fixed_random_seed"] = 1
        d["seq_ordering"] = "sorted_reverse"
    if subset:
        d["fixed_random_subset"] = subset  # faster
    return d

train = get_dataset("train", train_partition_epoch=EpochSplit)
dev = get_dataset("dev", subset=3000)
cache_size = "0" | |
window = 1 | |
# network | |
# (also defined by num_inputs & num_outputs) | |
target = "classes" | |
EncKeyTotalDim = 1024 | |
AttNumHeads = 1 | |
AttWindowSize = 5 | |
EncKeyPerHeadDim = EncKeyTotalDim // AttNumHeads | |
#EncValueTotalDim = 2048 | |
EncValueTotalDim = 1024 | |
EncValuePerHeadDim = EncValueTotalDim // AttNumHeads | |
#LstmDim = EncValueTotalDim // 2 | |
LstmDim = EncValueTotalDim | |
network = { | |
"source": {"class": "eval", "eval": "tf.clip_by_value(source(0), -3.0, 3.0)"}, | |
"lstm0_fw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": 1, "from": ["source"] }, | |
#"lstm0_bw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": -1, "from": ["source"] }, | |
#"lstm0_pool": {"class": "pool", "mode": "max", "padding": "same", "pool_size": (1,), "from": ["lstm0_fw"], "trainable": False}, | |
"lstm1_fw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": 1, "from": ["lstm0_fw"], "dropout": 0.3 }, | |
#"lstm1_bw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": -1, "from": ["lstm0_pool"], "dropout": 0.3 }, | |
#"lstm1_pool": {"class": "pool", "mode": "max", "padding": "same", "pool_size": (1,), "from": ["lstm1_fw"], "trainable": False}, | |
"lstm2_fw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": 1, "from": ["lstm1_fw"], "dropout": 0.3 }, | |
#"lstm2_bw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": -1, "from": ["lstm1_pool"], "dropout": 0.3 }, | |
#"lstm2_pool": {"class": "pool", "mode": "max", "padding": "same", "pool_size": (1,), "from": ["lstm2_fw"], "trainable": False}, | |
"lstm3_fw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": 1, "from": ["lstm2_fw"], "dropout": 0.3 }, | |
#"lstm3_bw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": -1, "from": ["lstm2_pool"], "dropout": 0.3 }, | |
#"lstm3_pool": {"class": "pool", "mode": "max", "padding": "same", "pool_size": (1,), "from": ["lstm3_fw"], "trainable": False}, | |
"lstm4_fw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": 1, "from": ["lstm3_fw"], "dropout": 0.3 }, | |
#"lstm4_bw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": -1, "from": ["lstm3_pool"], "dropout": 0.3 }, | |
#"lstm4_pool": {"class": "pool", "mode": "max", "padding": "same", "pool_size": (1,), "from": ["lstm4_fw"], "trainable": False}, | |
"lstm5_fw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": 1, "from": ["lstm4_fw"], "dropout": 0.3 }, | |
#"lstm5_bw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": -1, "from": ["lstm4_pool"], "dropout": 0.3 }, | |
# "encoder": {"class": "copy", "from": ["lstm5_fw", "lstm5_bw"]}, # dim: EncValueTotalDim | |
"encoder": {"class": "copy", "from": ["lstm5_fw"]}, | |
"enc_ctx": {"class": "linear", "activation": None, "with_bias": True, "from": ["encoder"], "n_out": EncKeyTotalDim}, # preprocessed_attended in Blocks | |
"inv_fertility": {"class": "linear", "activation": "sigmoid", "with_bias": False, "from": ["encoder"], "n_out": AttNumHeads}, | |
"enc_value": {"class": "split_dims", "axis": "F", "dims": (AttNumHeads, EncValuePerHeadDim), "from": ["encoder"]}, # (B, enc-T, H, D'/H) | |
"output": {"class": "rec", "from": [], 'cheating': config.bool("cheating", False), "unit": { | |
'output': {'class': 'choice', 'target': target, 'beam_size': beam_size, 'cheating': config.bool("cheating", False), 'from': ["output_prob"], "initial_output": 0}, | |
"end": {"class": "compare", "from": ["output"], "value": 0}, | |
'target_embed': {'class': 'linear', 'activation': None, "with_bias": False, 'from': ['output'], "n_out": 621, "initial_output": 0}, # feedback_input | |
"weight_feedback": {"class": "linear", "activation": None, "with_bias": False, "from": ["prev:accum_att_weights"], "n_out": EncKeyTotalDim}, | |
"s_transformed": {"class": "linear", "activation": None, "with_bias": False, "from": ["s"], "n_out": EncKeyTotalDim}, | |
# (T, B) | |
"p_t": {"class": "eval", "from": "p_t_in", "eval": "tf.to_float(source(0))"}, | |
#"p_t_in": {"class": "eval", "from": "prev:att_weights", "eval": "tf.squeeze(tf.argmax(source(0), axis=1, output_type=tf.int32), axis=1)", | |
# "out_type": {"shape": (), "batch_dim_axis": 0, "dtype": "float32"}}, | |
"p_t_in": {"class": "reduce", "from": "prev:att_weights", "mode": "argmax", "axis": "t"}, | |
#"p_t_print": {"class": "eval", "from": "p_t_in", "eval": "tf.Print(source(0), [tf.shape(source(0)),source(0)], \"p_t_in\", summarize=200)"}, | |
#"p_t": {"class": "eval", "from": "p_t_in", "eval": "tf.maximum(0., source(0)))" % (AttWindowSize // 2), | |
#"out_type": {"sparse": False, "shape": (), "dtype": "float32"}, "initial_output": 0}, | |
#"energy_in_enc_ctx": {"class": "slice_nd", "from": ["base:enc_ctx"], "start": "p_t", "size": AttWindowSize}, # (B, size, 1000) | |
"energy_in": {"class": "combine", "kind": "add", "from": ["base:enc_ctx", "weight_feedback", "s_transformed"], "n_out": EncKeyTotalDim}, | |
"energy_tanh": {"class": "activation", "activation": "tanh", "from": ["energy_in"]}, | |
"energy": {"class": "linear", "activation": None, "with_bias": False, "from": ["energy_tanh"], "n_out": AttNumHeads}, # (B, enc-T, H) | |
"energy_reinterpreted": {"class": "reinterpret_data", "enforce_batch_major": True, "from": "energy", "trainable": False}, | |
"att_weights": {"class": "softmax_over_spatial", "from": ["energy_reinterpreted"], "window_start": "p_t_in", "window_size": AttWindowSize}, # (B, enc-T, H) | |
#"att_weights_print": {"class": "eval", "from": "att_weights", "eval": "tf.Print(source(0), [tf.shape(source(0)), source(0)], summarize=99)"}, | |
#"att_weights": {"class": "softmax_over_spatial", "from": ["energy"]}, # (B, enc-T, H) | |
# (B, T, H) + (B, T, H) | |
"accum_att_weights": {"class": "eval", "from": ["prev:accum_att_weights", "att_weights", "base:inv_fertility"], | |
"eval": "source(0) + source(1) * source(2) * 0.5", "out_type": {"dim": AttNumHeads, "shape": (None, AttNumHeads)}}, | |
"att0": {"class": "generic_attention", "weights": "att_weights", "base": "base:enc_value"}, # (B, H, V) | |
"att": {"class": "merge_dims", "axes": "except_batch", "from": ["att0"]}, # (B, H*V) | |
"s": {"class": "rnn_cell", "unit": "LSTMBlock", "from": ["prev:target_embed", "prev:att"], "n_out": 1000}, # transform | |
"readout_in": {"class": "linear", "from": ["s", "prev:target_embed", "att"], "activation": None, "n_out": 1000}, # merge + post_merge bias | |
"readout": {"class": "reduce_out", "mode": "max", "num_pieces": 2, "from": ["readout_in"]}, | |
"output_prob": { | |
"class": "softmax", "from": ["readout"], "dropout": 0.3, | |
"target": target, "loss": "ce", "loss_opts": {"label_smoothing": 0.1}, | |
"loss_only_on_non_search": True}, | |
}, "target": target, "max_seq_len": "max_len_from('base:encoder')"}, | |
"decision": { | |
"class": "decide", "from": ["output"], "loss": "edit_distance", "target": target, | |
"loss_only_on_non_search": False, | |
}, | |
"ctc": {"class": "softmax", "from": ["encoder"], "loss": "ctc", "target": target, | |
"loss_opts": {"beam_width": 1, "ctc_opts": {"ignore_longer_outputs_than_inputs": True}}} | |
} | |
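# Losses: the decoder trains with label-wise cross-entropy ("output_prob",
# label smoothing 0.1); "ctc" adds an auxiliary CTC loss on the encoder over
# the same BPE targets, which typically speeds up and stabilizes encoder
# convergence. "decision" picks the best beam hypothesis during search and
# reports the label edit distance as an error measure (it contributes no
# training gradient).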
search_output_layer = "decision" | |
debug_print_layer_output_template = True | |
#debug_print_layer_output_shape = True | |
# trainer | |
batching = "random" | |
log_batch_size = True | |
batch_size = 20000 | |
max_seqs = 200 | |
max_seq_length = {"classes": 75} | |
#chunking = "" # no chunking | |
truncation = -1 | |
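# Note: batch_size counts time frames summed over all sequences in a batch,
# not sequences; max_seqs additionally caps the number of sequences per
# batch, and sequences with more than 75 target labels are skipped during
# training (max_seq_length).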
def custom_construction_algo(idx, net_dict):
    # For debugging, use: python3 ./crnn/Pretrain.py config... Maybe set repetitions=1 below.
    # We will first construct layer-by-layer, starting with 2 layers.
    # Initially, we will use a higher reduction factor, and at the end, we will reduce it.
    # Also, we will initially have no label smoothing.
    orig_num_lstm_layers = 0
    while "lstm%i_fw" % orig_num_lstm_layers in net_dict:
        orig_num_lstm_layers += 1
    assert orig_num_lstm_layers >= 2
    orig_red_factor = 1
    #for i in range(orig_num_lstm_layers - 1):
    #    orig_red_factor *= net_dict["lstm%i_pool" % i]["pool_size"][0]
    num_lstm_layers = idx + 2  # idx starts at 0. start with 2 layers
    if idx == 0:
        net_dict["lstm%i_fw" % (orig_num_lstm_layers - 1)]["dropout"] = 0
        #net_dict["lstm%i_bw" % (orig_num_lstm_layers - 1)]["dropout"] = 0
    if idx >= 1:
        num_lstm_layers -= 1  # repeat like idx=0, but now with dropout
    # We will start with a higher reduction factor initially, for better convergence.
    red_factor = 2 ** 5
    if num_lstm_layers == orig_num_lstm_layers + 1:
        # Use original reduction factor now.
        num_lstm_layers = orig_num_lstm_layers
        red_factor = orig_red_factor
    if num_lstm_layers > orig_num_lstm_layers:
        # Finish. This will also use label smoothing then.
        return None
    # Use label smoothing only at the very end.
    net_dict["output"]["unit"]["output_prob"]["loss_opts"]["label_smoothing"] = 0
    # Other options during pretraining.
    if idx == 0:
        net_dict["#config"] = {"max_seq_length": {"classes": 60}}
        net_dict["#repetition"] = 10
    # Leave the last LSTM layer as-is, but only modify its source.
    net_dict["lstm%i_fw" % (orig_num_lstm_layers - 1)]["from"] = ["lstm%i_fw" % (num_lstm_layers - 2)]
    #net_dict["lstm%i_bw" % (orig_num_lstm_layers - 1)]["from"] = ["lstm%i_pool" % (num_lstm_layers - 2)]
    #if red_factor > orig_red_factor:
    #    for i in range(num_lstm_layers - 2):
    #        net_dict["lstm%i_pool" % i]["pool_size"] = (1,)
    #    # Increase last pool-size to get the initial reduction factor.
    #    assert red_factor % (1 ** (num_lstm_layers - 2)) == 0
    #    last_pool_size = red_factor // (2 ** (num_lstm_layers - 2))
    #    last_pool_size = 1
    #    # Increase last pool-size to get the same encoder-seq-length folding.
    #    net_dict["lstm%i_pool" % (num_lstm_layers - 2)]["pool_size"] = (last_pool_size,)
    # Delete unused LSTM layers. This is not strictly necessary, but maybe nicer.
    for i in range(num_lstm_layers - 1, orig_num_lstm_layers - 1):
        del net_dict["lstm%i_fw" % i]
        #del net_dict["lstm%i_bw" % i]
        #del net_dict["lstm%i_pool" % i]
    return net_dict

pretrain = {"repetitions": 1, "construction_algo": custom_construction_algo}
num_epochs = 270
model = "data/exp-%s/model" % setup_name
#model = "net-model/network"
cleanup_old_models = True
gradient_clip = 0
#gradient_clip_global_norm = 1.0
adam = True
optimizer_epsilon = 1e-8
#debug_add_check_numerics_ops = True
#debug_add_check_numerics_on_output = True
stop_on_nonfinite_train_score = False
tf_log_memory_usage = True
gradient_noise = 0.0
learning_rate = 0.00001
learning_rate_control = "newbob_multi_epoch"
#learning_rate_control_error_measure = "dev_score_output"
learning_rate_control_relative_error_relative_lr = True
learning_rate_control_min_num_epochs_per_new_lr = 3
use_learning_rate_control_always = True
newbob_multi_num_epochs = 2
newbob_multi_update_interval = 1
newbob_learning_rate_decay = 0.9
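# Newbob-style LR schedule (roughly): the dev score is checked every
# newbob_multi_update_interval sub-epoch(s), averaged over
# newbob_multi_num_epochs sub-epochs; when it stops improving, the learning
# rate is multiplied by newbob_learning_rate_decay (0.9), with at least 3
# sub-epochs between LR changes (learning_rate_control_min_num_epochs_per_new_lr).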
learning_rate_file = "data/exp-%s/train-scores.data" % setup_name | |
#learning_rate_file = "newbob.data" | |
# log | |
#log = "| /u/zeyer/dotfiles/system-tools/bin/mt-cat.py >> log/crnn.seq-train.%s.log" % task | |
log = "data/exp-%s/returnn.%s.$date.log" % (setup_name, task) | |
#log = "log/crnn.%s.log" % task | |
log_verbosity = 5 |