-
-
Save Max-Ryujin/690647f79773d6cd8338c524be039040 to your computer and use it in GitHub Desktop.
Config for the setup. The relevant function is run_scf_baseline_big()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import copy | |
import os.path | |
from sisyphus import tk, gs | |
from sisyphus.delayed_ops import DelayedFormat | |
from i6_core.meta.system import CorpusObject | |
from i6_core.lexicon.modification import AddEowPhonemesToLexiconJob | |
from i6_core.returnn.config import CodeWrapper | |
from i6_core.recognition import Hub5ScoreJob | |
from i6_experiments.common.datasets.switchboard.corpus_eval import get_hub5e00 | |
from i6_experiments.common.setups.rasr.util import RasrDataInput | |
from i6_experiments.users.berger.recipe.lexicon.modification import DeleteEmptyOrthJob, MakeBlankLexiconJob | |
from i6_experiments.users.vieting.tools.report import Report | |
# TODO: run_gmm_system_from_common might be copied here for stability | |
from i6_experiments.users.vieting.experiments.switchboard.hybrid.feat.experiments import run_gmm_system_from_common | |
from i6_experiments.users.vieting.experiments.switchboard.ctc.feat.transducer_system_v2 import ( | |
TransducerSystem, | |
ReturnnConfigs, | |
ScorerInfo, | |
SearchTypes, | |
) | |
from .baseline_args import get_nn_args as get_nn_args_baseline | |
from .data import get_corpus_data_inputs_oggzip # TODO: might be copied here for stability | |
from .default_tools import RASR_BINARY_PATH, RETURNN_ROOT, RETURNN_EXE, SCTK_BINARY_PATH | |
def get_datasets(**kwargs):
    """Collect the training datasets, RASR loss inputs and the hub5e00 dev corpus.

    All keyword arguments are forwarded unchanged to
    ``get_corpus_data_inputs_oggzip``.

    Returns:
        A 5-tuple ``(returnn_datasets, rasr_loss_corpus, rasr_loss_segments,
        rasr_loss_lexicon, dev_corpora)``.
    """
    gmm = run_gmm_system_from_common()

    # TODO: get oggzip independent of GMM system
    # noinspection PyTypeChecker
    (
        train_inputs,
        cv_inputs,
        devtrain_inputs,
        dev_inputs,
        _test_inputs,  # returned by the helper but not needed here
        train_corpus,
        traincv_segments,
    ) = get_corpus_data_inputs_oggzip(
        gmm,
        partition_epoch={"train": 6, "dev": 1},
        returnn_root=RETURNN_ROOT,
        returnn_python_exe=RETURNN_EXE,
        **kwargs,
    )

    def ogg_dataset(data_inputs, key):
        # Pick the "ogg" sub-dataset out of the data dict of one corpus entry.
        return data_inputs[key].get_data_dict()["datasets"]["ogg"]

    # Only the train set is wrapped into a MultiProcDataset.
    train_dataset = {
        "class": "MultiProcDataset",
        "dataset": ogg_dataset(train_inputs, "switchboard.train"),
        "num_workers": 2,
        "buffer_size": 5,
    }
    cv_dataset = ogg_dataset(cv_inputs, "switchboard.cv")
    devtrain_dataset = ogg_dataset(devtrain_inputs, "switchboard.devtrain")
    returnn_datasets = {
        "train": train_dataset,
        "dev": cv_dataset,
        "eval_datasets": {"devtrain": devtrain_dataset},
    }

    # Lexicon chain: drop empty orths -> blank lexicon (RASR loss) -> add EOW phonemes (recognition).
    cleaned_lexicon = DeleteEmptyOrthJob(gmm.crp["switchboard"].lexicon_config.file).out_lexicon
    blank_lexicon = MakeBlankLexiconJob(cleaned_lexicon).out_lexicon
    eow_lexicon = AddEowPhonemesToLexiconJob(
        blank_lexicon,
        nonword_phones=["[LAUGHTER]", "[NOISE]", "[VOCALIZEDNOISE]"],
    ).out_lexicon

    # The bliss corpus/stm/glm come from get_hub5e00(); audio format, duration and LM
    # are taken from the "hub5e00" dev data input.
    hub5e00 = get_hub5e00()
    hub5e00_crp = dev_inputs["hub5e00"].crp
    eval_corpus = CorpusObject()
    eval_corpus.corpus_file = hub5e00.bliss_corpus
    eval_corpus.audio_format = hub5e00_crp.audio_format
    eval_corpus.duration = hub5e00_crp.corpus_duration

    recog_lexicon_spec = {
        "filename": eow_lexicon,
        "normalize_pronunciation": False,
        "add_all": True,
        "add_from_lexicon": False,
    }
    recog_lm_spec = {"filename": hub5e00_crp.language_model_config.file, "type": "ARPA"}
    dev_corpora = {
        "hub5e00": RasrDataInput(
            corpus_object=eval_corpus,
            lexicon=recog_lexicon_spec,
            lm=recog_lm_spec,
            stm=hub5e00.stm,
            glm=hub5e00.glm,
        )
    }

    return returnn_datasets, train_corpus, traincv_segments, blank_lexicon, dev_corpora
def _test_mel_lr_args(increase_epochs, decrease_epochs, peak_epochs=None, scale=1):
    """Build a fresh LR-schedule dict for the ``run_test_mel`` experiments.

    Args:
        increase_epochs: value of the ``"increase_epochs"`` entry.
        decrease_epochs: value of the ``"decrease_epochs"`` entry.
        peak_epochs: value of the ``"peak_epochs"`` entry; the key is left out when ``None``.
        scale: factor applied to the start, peak and end learning rate.

    Returns:
        A new dict on every call, so experiments never share one schedule object.
    """
    lr_args = {
        "peak_lr": scale * 4e-4,
        "start_lr": scale * 1.325e-05,
        "end_lr": scale * 1e-5,
        "increase_epochs": increase_epochs,
    }
    if peak_epochs is not None:
        lr_args["peak_epochs"] = peak_epochs
    lr_args["decrease_epochs"] = decrease_epochs
    lr_args["final_epochs"] = 0
    return lr_args


def _test_mel_returnn_configs(nn_args):
    """Bundle each training config with a prior config and its recognition config."""
    returnn_configs = {}
    for exp in nn_args.returnn_training_configs:
        # The prior config is the training config with an integer batch size.
        prior_config = copy.deepcopy(nn_args.returnn_training_configs[exp])
        prior_config.config["batch_size"] = prior_config.config["batch_size"]["data"]
        assert isinstance(prior_config.config["batch_size"], int)
        returnn_configs[exp] = ReturnnConfigs(
            train_config=nn_args.returnn_training_configs[exp],
            prior_config=prior_config,
            recog_configs={"recog": nn_args.returnn_recognition_configs[exp]},
        )
    return returnn_configs


def _test_mel_fork_system(system, keep_trainings):
    """Deep-copy ``system`` and drop every training not listed in ``keep_trainings``."""
    fork = copy.deepcopy(system)
    for train_name in list(fork.returnn_configs.keys()):
        if train_name not in keep_trainings:
            # only use some trainings
            fork.returnn_configs.pop(train_name)
    return fork


def run_test_mel():
    """Run the log-Mel conformer CTC experiments with batch size 10000.

    Steps, in order:
      1. 300-epoch trainings with several LR schedules / SpecAugment variants,
         recognition on hub5e00.
      2. Extra recognitions for single models (other prior scales, other epoch).
      3. Blank-penalty recognitions on a copy of the system.
      4. Recognitions with a different LM and a different lexicon on copies of the system.
      5. 450-epoch trainings on a further copy of the system.
      6. All reports are merged and registered as ``report.csv``.

    Sets ``gs.ALIAS_AND_OUTPUT_SUBDIR`` as a side effect and returns nothing.
    """
    gs.ALIAS_AND_OUTPUT_SUBDIR = "experiments/switchboard/ctc/feat/"

    (
        returnn_datasets,
        rasr_loss_corpus_path,
        rasr_loss_corpus_segments,
        rasr_loss_lexicon_path,
        dev_corpora,
    ) = get_datasets()
    returnn_args = {
        "batch_size": 10000,
        "rasr_binary_path": RASR_BINARY_PATH,
        "rasr_loss_corpus_path": rasr_loss_corpus_path,
        "rasr_loss_corpus_segments": rasr_loss_corpus_segments,
        "rasr_loss_lexicon_path": rasr_loss_lexicon_path,
        "datasets": returnn_datasets,
    }
    # NOTE(review): the key is spelled "wavenorm" here but "wave_norm" in
    # feature_args_wave_norm further down. Left untouched because changing it would
    # alter the configs of the existing experiments - confirm this is intended.
    feature_args = {"class": "LogMelNetwork", "wavenorm": True, "frame_size": 200, "frame_shift": 80, "fft_size": 256}

    # Variant of the datasets with a different sequence ordering for the train set.
    returnn_datasets_laplace25 = copy.deepcopy(returnn_datasets)
    returnn_datasets_laplace25["train"]["seq_ordering"] = "laplace:.25"

    nn_args, report_args_collection = get_nn_args_baseline(
        nn_base_args={
            # "lgm80_conf-simon": dict(
            #     returnn_args={"conformer_type": "simon", **returnn_args},
            #     feature_args=feature_args,
            # ),
            "lgm80_conf-wei_old-lr": dict(
                returnn_args={"conformer_type": "wei", **returnn_args},
                feature_args=feature_args,
                report_args={"architecture": "conf-wei", "lr": "default"},
            ),
            "lgm80_conf-wei_old-lr-4e-4": dict(
                returnn_args={"conformer_type": "wei", **returnn_args},
                feature_args=feature_args,
                lr_args={"peak_lr": 4e-4},
                report_args={"architecture": "conf-wei", "lr": "default_peak_4e-4"},
            ),
            "lgm80_conf-wei": dict(  # matches original lr schedule from wei
                returnn_args={"conformer_type": "wei", **returnn_args},
                feature_args=feature_args,
                lr_args=_test_mel_lr_args(increase_epochs=119, decrease_epochs=120, peak_epochs=1),
                report_args={"architecture": "conf-wei", "lr": "wei_peak_4e-4"},
            ),
            "lgm80_conf-wei2": dict(  # almost matches original lr schedule from wei
                returnn_args={"conformer_type": "wei", **returnn_args},
                feature_args=feature_args,
                lr_args=_test_mel_lr_args(increase_epochs=119, decrease_epochs=119, peak_epochs=2),
                report_args={"architecture": "conf-wei", "lr": "wei_peak_4e-4"},
            ),
            "lgm80_conf-wei-oldspecaug": dict(  # specaugment as in wei's setup
                returnn_args={"conformer_type": "wei", "specaug_old": {}, **returnn_args},
                feature_args=feature_args,
                lr_args=_test_mel_lr_args(increase_epochs=119, decrease_epochs=119, peak_epochs=2),
                report_args={"architecture": "conf-wei", "lr": "wei_peak_4e-4", "specaug": "wei"},
            ),
            "lgm80_conf-wei-oldspecaug2": dict(  # specaugment as in wei's setup but double feature dim due to log Mel
                returnn_args={"conformer_type": "wei", "specaug_old": {"max_feature": 8}, **returnn_args},
                feature_args=feature_args,
                lr_args=_test_mel_lr_args(increase_epochs=119, decrease_epochs=119, peak_epochs=2),
                report_args={"architecture": "conf-wei", "lr": "wei_peak_4e-4", "specaug": "wei_adapt_80dim"},
            ),
            "lgm80_conf-wei-oldspecaug-bs3200step": dict(
                returnn_args={
                    "conformer_type": "wei",
                    "specaug_old": {},
                    **returnn_args,
                    "batch_size": {"data": 514635},
                },
                feature_args=feature_args,
                lr_args=_test_mel_lr_args(increase_epochs=119, decrease_epochs=119, peak_epochs=2),
                report_args={"architecture": "conf-wei", "lr": "wei_peak_4e-4", "specaug": "wei"},
            ),
            "lgm80_conf-wei-oldspecaug2-bs3200step": dict(
                returnn_args={
                    "conformer_type": "wei",
                    "specaug_old": {"max_feature": 8},
                    **returnn_args,
                    "batch_size": {"data": 514635},
                },
                feature_args=feature_args,
                lr_args=_test_mel_lr_args(increase_epochs=119, decrease_epochs=119, peak_epochs=2),
                report_args={"architecture": "conf-wei", "lr": "wei_peak_4e-4", "specaug": "wei_adapt_80dim"},
            ),
            "lgm80_conf-wei-oldspecaug-laplace25": dict(
                returnn_args={
                    "conformer_type": "wei",
                    "specaug_old": {},
                    **returnn_args,
                    "datasets": returnn_datasets_laplace25,
                },
                feature_args=feature_args,
                lr_args=_test_mel_lr_args(increase_epochs=119, decrease_epochs=119, peak_epochs=2),
                report_args={"architecture": "conf-wei", "lr": "wei_peak_4e-4", "specaug": "wei"},
            ),
            "lgm80_conf-wei-oldspecaug2-lrv1": dict(
                returnn_args={"conformer_type": "wei", "specaug_old": {"max_feature": 8}, **returnn_args},
                feature_args=feature_args,
                # all three learning rates doubled
                lr_args=_test_mel_lr_args(increase_epochs=119, decrease_epochs=119, peak_epochs=2, scale=2),
                report_args={"architecture": "conf-wei", "lr": "wei_peak_8e-4", "specaug": "wei_adapt_80dim"},
            ),
            # "lgm80_conf-wei2-nadam": dict(  # does not work well
            #     returnn_args={
            #         "conformer_type": "wei",
            #         "extra_args": {"optimizer": None, "optimizer_epsilon": 1e-8, "nadam": True},
            #         **returnn_args,
            #     },
            #     feature_args=feature_args,
            #     lr_args={
            #         "peak_lr": 4e-4, "start_lr": 1.325e-05, "end_lr": 1e-5,
            #         "increase_epochs": 119, "peak_epochs": 2, "decrease_epochs": 119, "final_epochs": 0,
            #     },
            # ),
            # "gt40_pe_conf-wei_old-lr": dict(  # does not converge
            #     returnn_args={"conformer_type": "wei", **returnn_args},
            #     feature_args={
            #         "class": "GammatoneNetwork", "sample_rate": 8000, "freq_max": 3800., "output_dim": 40,
            #         "preemphasis": 1.0,
            #     },
            # ),
            # "acc2_scf750_conf-wei_old-lr": dict(  # does not converge
            #     returnn_args={
            #         "conformer_type": "wei",
            #         **returnn_args,
            #         "batch_size": 5000,
            #         "extra_args": {"accum_grad_multiple_step": 2},
            #     },
            #     feature_args={"class": "ScfNetwork", "size_tf": 256 // 2, "stride_tf": 10 // 2},
            # ),
        },
        num_epochs=300,
        prefix="conformer_bs10k_",
    )

    returnn_configs = _test_mel_returnn_configs(nn_args)

    recog_args = {
        "lm_scales": [0.7],
        "prior_scales": [0.3, 0.5],
        "epochs": [300],
        "lookahead_options": {"lm_lookahead_scale": 0.7},
        "label_scorer_args": {
            "use_prior": True,
            "extra_args": {"blank_label_index": 0},
        },
        "label_tree_args": {"skip_silence": True},
        "search_parameters": {
            "allow-blank-label": True,
            "allow-label-loop": True,
            "allow-label-recombination": True,
            "allow-word-end-recombination": True,
            "create-lattice": True,
            "label-pruning": 11.2,
            "label-pruning-limit": 100000,
            "word-end-pruning": 0.5,
            "word-end-pruning-limit": 10000,
        },
    }
    score_info = ScorerInfo()
    score_info.ref_file = dev_corpora["hub5e00"].stm
    score_info.job_type = Hub5ScoreJob
    score_info.score_kwargs = {"glm": dev_corpora["hub5e00"].glm, "sctk_binary_path": SCTK_BINARY_PATH}

    ctc_nn_system = TransducerSystem(
        returnn_root=RETURNN_ROOT,
        returnn_python_exe=RETURNN_EXE,
        rasr_binary_path=RASR_BINARY_PATH,
        require_native_lstm=False,
    )
    ctc_nn_system.init_system(
        returnn_configs=returnn_configs,
        dev_keys=["hub5e00"],
        corpus_data=dev_corpora,
        am_args={
            "state_tying": "monophone",
            "states_per_phone": 1,
            "tdp_transition": (0, 0, 0, 0),
            "tdp_silence": (0, 0, 0, 0),
            "phon_history_length": 0,
            "phon_future_length": 0,
        },
        scorer_info=score_info,
        report=Report(
            columns_start=["train_name"],
            columns_end=["lm_scale", "prior_scale", "sub", "del", "ins", "wer"],
        ),
    )
    # Take the allophones from a fixed file instead of deriving them from the lexicon.
    ctc_nn_system.crp["hub5e00"].acoustic_model_config.allophones.add_from_lexicon = False
    ctc_nn_system.crp["hub5e00"].acoustic_model_config.allophones.add_all = True
    ctc_nn_system.crp["hub5e00"].acoustic_model_config.allophones.add_from_file = tk.Path(
        "/u/vieting/setups/swb/20230406_feat/dependencies/allophones_blank",
        hash_overwrite="SWB_ALLOPHONE_FILE_WEI_BLANK",
        cached=True,
    )
    ctc_nn_system.run_train_step(nn_args.training_args)
    ctc_nn_system.run_dev_recog_step(recog_args=recog_args, report_args=report_args_collection)
    ctc_nn_system.run_recogs_for_corpora(  # test for single model with larger prior_scales
        ["hub5e00"],
        "conformer_bs10k_lgm80_conf-wei_old-lr-4e-4",
        search_type=SearchTypes.GenericSeq2SeqSearchJob,
        report_args=report_args_collection,
        **{**recog_args, "prior_scales": [0.4, 0.6, 0.7, 0.9]},
    )
    ctc_nn_system.run_recogs_for_corpora(  # single model at an earlier epoch
        ["hub5e00"],
        "conformer_bs10k_lgm80_conf-wei_old-lr",
        search_type=SearchTypes.GenericSeq2SeqSearchJob,
        report_args=report_args_collection,
        **{**recog_args, "prior_scales": [0.5], "epochs": [260]},
    )

    # test blank penalty as we have more deletions than insertions
    ctc_nn_system_blank_penalty = copy.deepcopy(ctc_nn_system)
    exp_name = "conformer_bs10k_lgm80_conf-wei_old-lr-4e-4"
    recog_config = ctc_nn_system_blank_penalty.returnn_configs[exp_name].recog_configs.pop("recog")
    blank_index = 0
    num_outputs = 88
    for blank_penalty in [0.2, 0.3, 0.5, 0.8]:
        config = copy.deepcopy(recog_config)
        # Extra output layer: subtract the penalty from the blank entry of "output".
        config.config["network"]["output_blank_penalty"] = {
            "class": "eval",
            "from": "output",
            "is_output_layer": True,
            "eval": f"source(0) - tf.expand_dims("
            f"tf.one_hot([{blank_index}], {num_outputs}, on_value={blank_penalty}, dtype=tf.float32), axis=0)",
        }
        # NOTE(review): entries accumulate in recog_configs across iterations, so later
        # calls below presumably also see the configs of earlier penalties - confirm intended.
        ctc_nn_system_blank_penalty.returnn_configs[exp_name].recog_configs[
            f"recog_blank-penalty-{blank_penalty}"
        ] = config
        ctc_nn_system_blank_penalty.run_recogs_for_corpora(
            ["hub5e00"],
            exp_name,
            search_type=SearchTypes.GenericSeq2SeqSearchJob,
            report_args=report_args_collection,
            extra_name=f"_blank-pen-{blank_penalty}",
            tf_flow_args={"output_layer_name": "output_blank_penalty"},
            **{**recog_args, "prior_scales": [0.5]},
        )

    comparison_trainings = ["conformer_bs10k_lgm80_conf-wei2", "conformer_bs10k_lgm80_conf-wei_old-lr-4e-4"]

    # same lm as in wei's setup, results indicate that this is not better (if any, slightly worse)
    ctc_nn_system_wei_lm = _test_mel_fork_system(ctc_nn_system, comparison_trainings)
    report_args_wei_lm = copy.deepcopy(report_args_collection)
    for name in report_args_wei_lm:
        report_args_wei_lm[name]["lm"] = "wei"
    wei_lm = tk.Path(
        "/u/vieting/setups/swb/20230406_feat/dependencies/zoltan_4gram.gz",
        hash_overwrite="ZOLTAN_SWB_LM_4GRAM",
        cached=True,
    )
    ctc_nn_system_wei_lm.corpus_data["hub5e00"].lm["filename"] = wei_lm
    ctc_nn_system_wei_lm.crp["hub5e00"].language_model_config.file = wei_lm
    ctc_nn_system_wei_lm.run_dev_recog_step(recog_args=recog_args, extra_name="_lm-wei", report_args=report_args_wei_lm)

    # same lexicon as in wei's setup, results indicate that this is not better (less del, but overall slightly worse)
    ctc_nn_system_wei_lex = _test_mel_fork_system(ctc_nn_system, comparison_trainings)
    report_args_wei_lex = copy.deepcopy(report_args_collection)
    for name in report_args_wei_lex:
        report_args_wei_lex[name]["lex"] = "wei"
    wei_lex = tk.Path(
        "/u/vieting/setups/swb/20230406_feat/dependencies/lexicon_wei_blank.xml",
        hash_overwrite="WEI_SWB_LEX",
        cached=True,
    )
    ctc_nn_system_wei_lex.corpus_data["hub5e00"].lexicon["filename"] = wei_lex
    ctc_nn_system_wei_lex.run_dev_recog_step(
        recog_args=recog_args, extra_name="_lex-wei", report_args=report_args_wei_lex
    )

    # longer training to compensate for fewer steps per epoch
    feature_args_wave_norm = {
        "class": "LogMelNetwork",
        "wave_norm": True,
        "frame_size": 200,
        "frame_shift": 80,
        "fft_size": 256,
    }
    nn_args_e450, report_args_e450 = get_nn_args_baseline(
        nn_base_args={
            "lgm80_conf-wei-oldspecaug-e450v1": dict(
                returnn_args={"conformer_type": "wei", "specaug_old": {}, **returnn_args},
                feature_args=feature_args,
                lr_args=_test_mel_lr_args(increase_epochs=180, decrease_epochs=180),
                report_args={"architecture": "conf-wei", "lr": "wei_peak_4e-4_e450_cycle360", "specaug": "wei"},
            ),
            "lgm80_conf-wei-oldspecaug2-e450v1": dict(
                returnn_args={"conformer_type": "wei", "specaug_old": {"max_feature": 8}, **returnn_args},
                feature_args=feature_args,
                lr_args=_test_mel_lr_args(increase_epochs=180, decrease_epochs=180),
                report_args={
                    "architecture": "conf-wei",
                    "lr": "wei_peak_4e-4_e450_cycle360",
                    "specaug": "wei_adapt_80dim",
                },
            ),
            "lgm80_conf-wei-oldspecaug2-e450v2": dict(
                returnn_args={"conformer_type": "wei", "specaug_old": {"max_feature": 8}, **returnn_args},
                feature_args=feature_args,
                lr_args=_test_mel_lr_args(increase_epochs=160, decrease_epochs=160),
                report_args={
                    "architecture": "conf-wei",
                    "lr": "wei_peak_4e-4_e450_320cycle",
                    "specaug": "wei_adapt_80dim",
                },
            ),
            "lgm80_conf-wei-oldspecaug2-e450v3": dict(
                returnn_args={"conformer_type": "wei", "specaug_old": {"max_feature": 8}, **returnn_args},
                feature_args=feature_args,
                lr_args=_test_mel_lr_args(increase_epochs=200, decrease_epochs=200),
                report_args={
                    "architecture": "conf-wei",
                    "lr": "wei_peak_4e-4_e450_400_cycle",
                    "specaug": "wei_adapt_80dim",
                },
            ),
            "lgm80_conf-wei-oldspecaug2-e450v1-wavenorm": dict(
                returnn_args={"conformer_type": "wei", "specaug_old": {"max_feature": 8}, **returnn_args},
                feature_args=feature_args_wave_norm,
                lr_args=_test_mel_lr_args(increase_epochs=180, decrease_epochs=180),
                report_args={
                    "architecture": "conf-wei",
                    "lr": "wei_peak_4e-4_e450_cycle360",
                    "specaug": "wei_adapt_80dim",
                    "wave_norm": "True",
                },
            ),
        },
        num_epochs=450,
        prefix="conformer_bs10k_",
    )

    recog_args_e450 = copy.deepcopy(recog_args)
    recog_args_e450["epochs"] = [300, 400, 450, "best"]
    # Reuse the initialised system, but swap in the 450-epoch experiments.
    ctc_nn_system_e450 = copy.deepcopy(ctc_nn_system)
    ctc_nn_system_e450.returnn_configs = _test_mel_returnn_configs(nn_args_e450)
    ctc_nn_system_e450.run_train_step(nn_args_e450.training_args)
    ctc_nn_system_e450.run_dev_recog_step(recog_args=recog_args_e450, report_args=report_args_e450)

    report = Report.merge_reports(
        [
            ctc_nn_system.report,
            ctc_nn_system_e450.report,
            ctc_nn_system_blank_penalty.report,
            ctc_nn_system_wei_lm.report,
            ctc_nn_system_wei_lex.report,
        ]
    )
    report.delete_redundant_columns()
    report.delete_redundant_rows()
    tk.register_report(
        os.path.join(gs.ALIAS_AND_OUTPUT_SUBDIR, "report.csv"),
        values=report.get_values(),
        template=report.get_template(),
    )
def run_nn_args(nn_args, report_args_collection, report_name, dev_corpora, recog_args=None):
    """Train the given experiments, recognize on hub5e00 and register a report.

    Args:
        nn_args: object providing ``returnn_training_configs``,
            ``returnn_recognition_configs`` (both indexed by experiment name) and
            ``training_args``.
        report_args_collection: forwarded as ``report_args`` to the recognition step.
        report_name: file name of the report below ``gs.ALIAS_AND_OUTPUT_SUBDIR``.
        dev_corpora: mapping with a ``"hub5e00"`` entry; its ``stm`` and ``glm`` are
            used for scoring and the mapping is passed on as corpus data.
        recog_args: optional dict whose top-level entries replace the default
            recognition arguments (shallow merge), e.g. ``{"epochs": [300]}`` for a
            training with a different number of epochs. ``None`` keeps the defaults.
    """
    returnn_configs = {}
    for exp in nn_args.returnn_training_configs:
        # The prior config is the training config with an integer batch size.
        prior_config = copy.deepcopy(nn_args.returnn_training_configs[exp])
        prior_config.config["batch_size"] = prior_config.config["batch_size"]["data"]
        assert isinstance(prior_config.config["batch_size"], int)
        returnn_configs[exp] = ReturnnConfigs(
            train_config=nn_args.returnn_training_configs[exp],
            prior_config=prior_config,
            recog_configs={"recog": nn_args.returnn_recognition_configs[exp]},
        )

    default_recog_args = {
        "lm_scales": [0.7],
        "prior_scales": [0.3, 0.5],
        "epochs": [300, 400, 450, "best"],
        "lookahead_options": {"lm_lookahead_scale": 0.7},
        "label_scorer_args": {
            "use_prior": True,
            "extra_args": {"blank_label_index": 0},
        },
        "label_tree_args": {"skip_silence": True},
        "search_parameters": {
            "allow-blank-label": True,
            "allow-label-loop": True,
            "allow-label-recombination": True,
            "allow-word-end-recombination": True,
            "create-lattice": True,
            "label-pruning": 11.2,
            "label-pruning-limit": 100000,
            "word-end-pruning": 0.5,
            "word-end-pruning-limit": 10000,
        },
    }
    # Caller-supplied entries win; without overrides this equals the defaults.
    effective_recog_args = {**default_recog_args, **(recog_args or {})}

    score_info = ScorerInfo()
    score_info.ref_file = dev_corpora["hub5e00"].stm
    score_info.job_type = Hub5ScoreJob
    score_info.score_kwargs = {"glm": dev_corpora["hub5e00"].glm, "sctk_binary_path": SCTK_BINARY_PATH}

    ctc_nn_system = TransducerSystem(
        returnn_root=RETURNN_ROOT,
        returnn_python_exe=RETURNN_EXE,
        rasr_binary_path=RASR_BINARY_PATH,
        require_native_lstm=False,
    )
    ctc_nn_system.init_system(
        returnn_configs=returnn_configs,
        dev_keys=["hub5e00"],
        corpus_data=dev_corpora,
        am_args={
            "state_tying": "monophone",
            "states_per_phone": 1,
            "tdp_transition": (0, 0, 0, 0),
            "tdp_silence": (0, 0, 0, 0),
            "phon_history_length": 0,
            "phon_future_length": 0,
        },
        scorer_info=score_info,
        report=Report(
            columns_start=["train_name"],
            columns_end=["lm_scale", "prior_scale", "sub", "del", "ins", "wer"],
        ),
    )
    # Take the allophones from a fixed file instead of deriving them from the lexicon.
    ctc_nn_system.crp["hub5e00"].acoustic_model_config.allophones.add_from_lexicon = False
    ctc_nn_system.crp["hub5e00"].acoustic_model_config.allophones.add_all = True
    ctc_nn_system.crp["hub5e00"].acoustic_model_config.allophones.add_from_file = tk.Path(
        "/u/vieting/setups/swb/20230406_feat/dependencies/allophones_blank",
        hash_overwrite="SWB_ALLOPHONE_FILE_WEI_BLANK",
        cached=True,
    )
    ctc_nn_system.run_train_step(nn_args.training_args)
    ctc_nn_system.run_dev_recog_step(recog_args=effective_recog_args, report_args=report_args_collection)

    assert ctc_nn_system.report is not None
    report = ctc_nn_system.report
    report.delete_redundant_columns()
    report.delete_redundant_rows()
    tk.register_report(
        os.path.join(gs.ALIAS_AND_OUTPUT_SUBDIR, report_name),
        values=report.get_values(),
        template=report.get_template(),
    )
def run_mel_baseline():
    """Train and evaluate the single log-Mel baseline experiment (batch size 5000, 450 epochs).

    Sets ``gs.ALIAS_AND_OUTPUT_SUBDIR`` and hands the resulting experiment over to
    ``run_nn_args``, which writes ``report_mel_5K_baseline.csv``.
    """
    gs.ALIAS_AND_OUTPUT_SUBDIR = "experiments/switchboard/ctc/feat/"

    datasets, loss_corpus, loss_segments, loss_lexicon, dev_corpora = get_datasets()

    shared_returnn_args = {
        "batch_size": 5000,
        "rasr_binary_path": RASR_BINARY_PATH,
        "rasr_loss_corpus_path": loss_corpus,
        "rasr_loss_corpus_segments": loss_segments,
        "rasr_loss_lexicon_path": loss_lexicon,
        "datasets": datasets,
    }
    log_mel_features = {
        "class": "LogMelNetwork",
        "wave_norm": True,
        "frame_size": 200,
        "frame_shift": 80,
        "fft_size": 256,
    }
    lr_schedule = {
        "peak_lr": 4e-4,
        "start_lr": 1.325e-05,
        "end_lr": 1e-5,
        "increase_epochs": 180,
        "decrease_epochs": 180,
        "final_epochs": 0,
    }
    report_labels = {
        "architecture": "conf-wei",
        "lr": "wei_peak_4e-4_e450_cycle360",
        "specaug": "wei_adapt_80dim",
        "wave_norm": "True",
    }
    baseline_experiment = dict(
        returnn_args={"conformer_type": "wei", "specaug_old": {"max_feature": 8}, **shared_returnn_args},
        feature_args=log_mel_features,
        lr_args=lr_schedule,
        report_args=report_labels,
    )

    nn_args, report_args_collection = get_nn_args_baseline(
        nn_base_args={"lgm80_baseline": baseline_experiment},
        num_epochs=450,
        prefix="conformer_bs5k_",
    )
    run_nn_args(nn_args, report_args_collection, "report_mel_5K_baseline.csv", dev_corpora)
def run_scf_baseline():
    """Train and evaluate the single ``ScfNetwork`` baseline experiment (batch size 5000, 450 epochs).

    Sets ``gs.ALIAS_AND_OUTPUT_SUBDIR`` and hands the resulting experiment over to
    ``run_nn_args``, which writes ``report_scf_baseline.csv``.
    """
    gs.ALIAS_AND_OUTPUT_SUBDIR = "experiments/switchboard/ctc/feat/"

    datasets, loss_corpus, loss_segments, loss_lexicon, dev_corpora = get_datasets()

    shared_returnn_args = {
        "batch_size": 5000,
        "rasr_binary_path": RASR_BINARY_PATH,
        "rasr_loss_corpus_path": loss_corpus,
        "rasr_loss_corpus_segments": loss_segments,
        "rasr_loss_lexicon_path": loss_lexicon,
        "datasets": datasets,
    }
    scf_features = {"class": "ScfNetwork", "size_tf": 256 // 2, "stride_tf": 10 // 2}
    lr_schedule = {
        "peak_lr": 4e-4,
        "start_lr": 1.325e-05,
        "end_lr": 1e-5,
        "increase_epochs": 180,
        "decrease_epochs": 180,
        "final_epochs": 0,
    }
    # NOTE(review): the report is labelled wave_norm="True" although the feature args
    # above carry no "wave_norm" entry - confirm the label is intended.
    report_labels = {
        "architecture": "conf-wei",
        "lr": "wei_peak_4e-4_e450_cycle360",
        "specaug": "wei_adapt_80dim",
        "wave_norm": "True",
    }
    scf_experiment = dict(
        returnn_args={"conformer_type": "wei", "specaug_old": {"max_feature": 8}, **shared_returnn_args},
        feature_args=scf_features,
        lr_args=lr_schedule,
        report_args=report_labels,
    )

    nn_args, report_args_collection = get_nn_args_baseline(
        nn_base_args={"scf_baseline": scf_experiment},
        num_epochs=450,
        prefix="conformer_bs5k_",
    )
    run_nn_args(nn_args, report_args_collection, "report_scf_baseline.csv", dev_corpora)
def run_scf_baseline_big():
    """Train and evaluate the ``scf_baseline_big`` experiment (batch size 5000, 450 epochs).

    Uses the same ``ScfNetwork`` feature args and LR schedule values as
    ``run_scf_baseline`` and additionally passes ``"watch_memory": True`` via
    ``extra_args``. The report is written as ``report_scf_baseline_big.csv``.
    """
    gs.ALIAS_AND_OUTPUT_SUBDIR = "experiments/switchboard/ctc/feat/"

    datasets, loss_corpus, loss_segments, loss_lexicon, dev_corpora = get_datasets()

    shared_returnn_args = {
        "batch_size": 5000,
        "rasr_binary_path": RASR_BINARY_PATH,
        "rasr_loss_corpus_path": loss_corpus,
        "rasr_loss_corpus_segments": loss_segments,
        "rasr_loss_lexicon_path": loss_lexicon,
        "datasets": datasets,
        "extra_args": {
            "watch_memory": True,
        },
    }
    scf_features = {"class": "ScfNetwork", "size_tf": 256 // 2, "stride_tf": 10 // 2}
    lr_schedule = {
        "peak_lr": 4e-4,
        "start_lr": 1.325e-05,
        "end_lr": 1e-5,
        "increase_epochs": 180,
        "decrease_epochs": 180,
        "final_epochs": 0,
    }
    # NOTE(review): the report is labelled wave_norm="True" although the feature args
    # above carry no "wave_norm" entry - confirm the label is intended.
    report_labels = {
        "architecture": "conf-wei",
        "lr": "wei_peak_4e-4_e450_cycle360",
        "specaug": "wei_adapt_80dim",
        "wave_norm": "True",
    }
    scf_big_experiment = dict(
        returnn_args={"conformer_type": "wei", "specaug_old": {"max_feature": 8}, **shared_returnn_args},
        feature_args=scf_features,
        lr_args=lr_schedule,
        report_args=report_labels,
    )

    nn_args, report_args_collection = get_nn_args_baseline(
        nn_base_args={"scf_baseline_big": scf_big_experiment},
        num_epochs=450,
        prefix="conformer_bs5k_",
    )
    run_nn_args(nn_args, report_args_collection, "report_scf_baseline_big.csv", dev_corpora)
def run_mel_audio_perturbation(): | |
gs.ALIAS_AND_OUTPUT_SUBDIR = "experiments/switchboard/ctc/feat/" | |
( | |
returnn_datasets, | |
rasr_loss_corpus_path, | |
rasr_loss_corpus_segments, | |
rasr_loss_lexicon_path, | |
dev_corpora, | |
) = get_datasets(pre_process=CodeWrapper("audio_perturb_runner.run")) | |
returnn_args = { | |
"batch_size": 5000, | |
"rasr_binary_path": RASR_BINARY_PATH, | |
"rasr_loss_corpus_path": rasr_loss_corpus_path, | |
"rasr_loss_corpus_segments": rasr_loss_corpus_segments, | |
"rasr_loss_lexicon_path": rasr_loss_lexicon_path, | |
"datasets": returnn_datasets, | |
"audio_perturbation": True, | |
} | |
nn_base_args = {} | |
feature_args = {"class": "LogMelNetwork", "wave_norm": True, "frame_size": 200, "frame_shift": 80, "fft_size": 256} | |
speeds = [ | |
{"prob": 0.6, "minimum": 0.9, "maximum": 1.1}, | |
{"prob": 0.6, "minimum": 0.8, "maximum": 1.2}, | |
{"prob": 0.6, "minimum": 0.7, "maximum": 1.3}, | |
{"prob": 0.5, "minimum": 0.9, "maximum": 1.1}, | |
{"prob": 0.5, "minimum": 0.8, "maximum": 1.2}, | |
{"prob": 0.5, "minimum": 0.7, "maximum": 1.3}, | |
{"prob": 0.4, "minimum": 0.9, "maximum": 1.1}, | |
{"prob": 0.4, "minimum": 0.8, "maximum": 1.2}, | |
{"prob": 0.4, "minimum": 0.7, "maximum": 1.3}, | |
] | |
tempos = [ | |
{"prob": 0.4, "minimum": 0.9, "maximum": 1.1}, | |
{"prob": 0.4, "minimum": 0.8, "maximum": 1.2}, | |
{"prob": 0.4, "minimum": 0.7, "maximum": 1.3}, | |
{"prob": 0.5, "minimum": 0.9, "maximum": 1.1}, | |
{"prob": 0.5, "minimum": 0.8, "maximum": 1.2}, | |
{"prob": 0.5, "minimum": 0.7, "maximum": 1.3}, | |
{"prob": 0.6, "minimum": 0.9, "maximum": 1.1}, | |
{"prob": 0.6, "minimum": 0.8, "maximum": 1.2}, | |
{"prob": 0.6, "minimum": 0.7, "maximum": 1.3}, | |
] | |
preemphases = [ | |
{"prob": 0.9, "minimum": 0.9, "maximum": 1.0}, | |
{"prob": 0.9, "minimum": 0.8, "maximum": 1.0}, | |
{"prob": 0.8, "minimum": 0.9, "maximum": 1.0}, | |
{"prob": 0.8, "minimum": 0.8, "maximum": 1.0}, | |
] | |
codecs = [ | |
{"encoding": "ULAW", "prob": 0.4}, | |
{"encoding": "ULAW", "prob": 0.6}, | |
] | |
non_linearities = [ | |
{"prob": 0.4, "alpha": 0.2}, | |
{"prob": 0.4, "alpha": 0.4}, | |
{"prob": 0.6, "alpha": 0.2}, | |
{"prob": 0.6, "alpha": 0.4}, | |
] | |
for speed in speeds: | |
key_suffix = [] | |
#build key suffix | |
key_suffix.append("speed"+ f"{speed['prob']}_{speed['minimum']}_{speed['maximum']}") | |
key = f"log_mel_conf-wei-oldspecaug-audio_perturbation_{'_'.join(key_suffix)}" | |
audio_perturb_args = {} | |
audio_perturb_args["speed"] = speed | |
report_args={ | |
"architecture": "conf-wei", | |
"lr": "wei_peak_4e-4_e450_cycle360", | |
"specaug": "wei_adapt_80dim", | |
"wave_norm": "True", | |
} | |
report_args["speed"] = f"{speed['prob']}_{speed['minimum']}_{speed['maximum']}" | |
nn_base_args[key] = dict( | |
returnn_args={ | |
"extra_args": { | |
"audio_perturb_args": audio_perturb_args, | |
"audio_perturb_runner": CodeWrapper("WaveformPerturbation(**audio_perturb_args)"), | |
}, | |
**returnn_args,}, | |
feature_args=feature_args, | |
lr_args={ | |
"peak_lr": 4e-4, | |
"start_lr": 1.325e-05, | |
"end_lr": 1e-5, | |
"increase_epochs": 180, | |
"decrease_epochs": 180, | |
"final_epochs": 0, | |
}, | |
report_args=report_args, | |
) | |
for tempo in tempos: | |
key_suffix = [] | |
key_suffix.append("tempo"+ f"{tempo['prob']}_{tempo['minimum']}_{tempo['maximum']}") | |
key = f"log_mel_conf-wei-oldspecaug-audio_perturbation_{'_'.join(key_suffix)}" | |
audio_perturb_args = {} | |
audio_perturb_args["tempo"] = tempo | |
report_args={ | |
"architecture": "conf-wei", | |
"lr": "wei_peak_4e-4_e450_cycle360", | |
"specaug": "wei_adapt_80dim", | |
"wave_norm": "True", | |
} | |
report_args["tempo"] = f"{tempo['prob']}_{tempo['minimum']}_{tempo['maximum']}" | |
nn_base_args[key] = dict( | |
returnn_args={ | |
"extra_args": { | |
"audio_perturb_args": audio_perturb_args, | |
"audio_perturb_runner": CodeWrapper("WaveformPerturbation(**audio_perturb_args)"), | |
}, | |
**returnn_args,}, | |
feature_args=feature_args, | |
lr_args={ | |
"peak_lr": 4e-4, | |
"start_lr": 1.325e-05, | |
"end_lr": 1e-5, | |
"increase_epochs": 180, | |
"decrease_epochs": 180, | |
"final_epochs": 0, | |
}, | |
report_args=report_args, | |
) | |
for preemphasis in preemphases: | |
key_suffix = [] | |
key_suffix.append("preemphasis"+ f"{preemphasis['prob']}_{preemphasis['minimum']}_{preemphasis['maximum']}") | |
key = f"log_mel_conf-wei-oldspecaug-audio_perturbation_{'_'.join(key_suffix)}" | |
audio_perturb_args = {} | |
audio_perturb_args["preemphasis"] = preemphasis | |
report_args={ | |
"architecture": "conf-wei", | |
"lr": "wei_peak_4e-4_e450_cycle360", | |
"specaug": "wei_adapt_80dim", | |
"wave_norm": "True", | |
} | |
report_args[ | |
"preemphasis" | |
] = f"{preemphasis['prob']}_{preemphasis['minimum']}_{preemphasis['maximum']}" | |
nn_base_args[key] = dict( | |
returnn_args={ | |
"extra_args": { | |
"audio_perturb_args": audio_perturb_args, | |
"audio_perturb_runner": CodeWrapper("WaveformPerturbation(**audio_perturb_args)"), | |
}, | |
**returnn_args,}, | |
feature_args=feature_args, | |
lr_args={ | |
"peak_lr": 4e-4, | |
"start_lr": 1.325e-05, | |
"end_lr": 1e-5, | |
"increase_epochs": 180, | |
"decrease_epochs": 180, | |
"final_epochs": 0, | |
}, | |
report_args=report_args, | |
) | |
for codec in codecs: | |
key_suffix = [] | |
key_suffix.append("codec" + f"wav_{codec['encoding'].lower()}_{codec['prob']}") | |
key = f"log_mel_conf-wei-oldspecaug-audio_perturbation_{'_'.join(key_suffix)}" | |
audio_perturb_args = {} | |
audio_perturb_args["codecs"] = [codec] | |
report_args={ | |
"architecture": "conf-wei", | |
"lr": "wei_peak_4e-4_e450_cycle360", | |
"specaug": "wei_adapt_80dim", | |
"wave_norm": "True", | |
} | |
report_args["codec"] = f"wav_{codec['encoding'].lower()}_{codec['prob']}" | |
nn_base_args[key] = dict( | |
returnn_args={ | |
"extra_args": { | |
"audio_perturb_args": audio_perturb_args, | |
"audio_perturb_runner": CodeWrapper("WaveformPerturbation(**audio_perturb_args)"), | |
}, | |
**returnn_args,}, | |
feature_args=feature_args, | |
lr_args={ | |
"peak_lr": 4e-4, | |
"start_lr": 1.325e-05, | |
"end_lr": 1e-5, | |
"increase_epochs": 180, | |
"decrease_epochs": 180, | |
"final_epochs": 0, | |
}, | |
report_args=report_args, | |
) | |
for non_linearity in non_linearities: | |
key_suffix = [] | |
key_suffix.append("non_linearity" + f"{non_linearity['prob']}_{non_linearity['alpha']}") | |
key = f"log_mel_conf-wei-oldspecaug-audio_perturbation_{'_'.join(key_suffix)}" | |
audio_perturb_args = {} | |
audio_perturb_args["non_linearities"] = [non_linearity] | |
report_args={ | |
"architecture": "conf-wei", | |
"lr": "wei_peak_4e-4_e450_cycle360", | |
"specaug": "wei_adapt_80dim", | |
"wave_norm": "True", | |
} | |
report_args["non_linearity"] = f"{non_linearity['prob']}_{non_linearity['alpha']}" | |
nn_base_args[key] = dict( | |
returnn_args={ | |
"extra_args": { | |
"audio_perturb_args": audio_perturb_args, | |
"audio_perturb_runner": CodeWrapper("WaveformPerturbation(**audio_perturb_args)"), | |
}, | |
**returnn_args,}, | |
feature_args=feature_args, | |
lr_args={ | |
"peak_lr": 4e-4, | |
"start_lr": 1.325e-05, | |
"end_lr": 1e-5, | |
"increase_epochs": 180, | |
"decrease_epochs": 180, | |
"final_epochs": 0, | |
}, | |
report_args=report_args, | |
) | |
nn_args, report_args_collection = get_nn_args_baseline( | |
nn_base_args=nn_base_args, | |
num_epochs=450, | |
prefix="conformer_bs5k_audio_perturbation_", | |
) | |
run_nn_args(nn_args, report_args_collection, "report_log_mel_audio_perturbation.csv", dev_corpora) | |
def _single_audio_perturbation_variants():
    """
    Enumerate the single-perturbation settings swept by the audio perturbation experiments.

    Each yielded item is a tuple ``(name, label, audio_perturb_args)``:

    * ``name`` is used as report column and as first part of the experiment key suffix,
    * ``label`` is the underscore-joined setting, e.g. ``"0.6_0.9_1.1"`` or ``"wav_ulaw_0.4"``,
    * ``audio_perturb_args`` holds the keyword arguments for exactly this one perturbation.

    The groups are yielded in the fixed order speed, tempo, preemphasis, codec, non_linearity and
    the values inside each group keep a fixed order as well, so a dict filled from this generator
    always has the same insertion order.
    """
    factor_ranges = ((0.9, 1.1), (0.8, 1.2), (0.7, 1.3))
    for name, probs in (("speed", (0.6, 0.5, 0.4)), ("tempo", (0.4, 0.5, 0.6))):
        for prob in probs:
            for minimum, maximum in factor_ranges:
                setting = {"prob": prob, "minimum": minimum, "maximum": maximum}
                yield name, f"{prob}_{minimum}_{maximum}", {name: setting}

    for prob in (0.9, 0.8):
        for minimum in (0.9, 0.8):
            setting = {"prob": prob, "minimum": minimum, "maximum": 1.0}
            yield "preemphasis", f"{prob}_{minimum}_{setting['maximum']}", {"preemphasis": setting}

    # codecs and non-linearities are handed over as one-element lists
    for prob in (0.4, 0.6):
        codec = {"encoding": "ULAW", "prob": prob}
        yield "codec", f"wav_{codec['encoding'].lower()}_{prob}", {"codecs": [codec]}

    for prob in (0.4, 0.6):
        for alpha in (0.2, 0.4):
            yield "non_linearity", f"{prob}_{alpha}", {"non_linearities": [{"prob": prob, "alpha": alpha}]}


def run_scf_audio_perturbation():
    """
    Set up the ScfNetwork conformer experiments with one waveform perturbation each.

    The datasets are requested with ``pre_process=CodeWrapper("audio_perturb_runner.run")``. For
    every variant from :func:`_single_audio_perturbation_variants` one entry is added to the
    experiment dict; its ``extra_args`` carry the perturbation arguments as ``audio_perturb_args``
    and a ``CodeWrapper`` that constructs ``WaveformPerturbation(**audio_perturb_args)`` as
    ``audio_perturb_runner``. All entries are passed to ``get_nn_args_baseline`` (450 epochs) and
    the result is handed to ``run_nn_args`` with the report name
    ``report_scf_audio_perturbation.csv``.
    """
    gs.ALIAS_AND_OUTPUT_SUBDIR = "experiments/switchboard/ctc/feat/"
    (
        returnn_datasets,
        rasr_loss_corpus_path,
        rasr_loss_corpus_segments,
        rasr_loss_lexicon_path,
        dev_corpora,
    ) = get_datasets(pre_process=CodeWrapper("audio_perturb_runner.run"))
    returnn_args = {
        "batch_size": 5000,
        "rasr_binary_path": RASR_BINARY_PATH,
        "rasr_loss_corpus_path": rasr_loss_corpus_path,
        "rasr_loss_corpus_segments": rasr_loss_corpus_segments,
        "rasr_loss_lexicon_path": rasr_loss_lexicon_path,
        "datasets": returnn_datasets,
        "audio_perturbation": True,
    }
    feature_args = {"class": "ScfNetwork", "size_tf": 256 // 2, "stride_tf": 10 // 2}

    nn_base_args = {}
    for name, label, audio_perturb_args in _single_audio_perturbation_variants():
        # the key suffix is the perturbation name directly followed by its label, e.g. "speed0.6_0.9_1.1"
        key = f"scf_conf-wei-oldspecaug-audio_perturbation_{name}{label}"
        nn_base_args[key] = dict(
            returnn_args={
                "extra_args": {
                    "audio_perturb_args": audio_perturb_args,
                    "audio_perturb_runner": CodeWrapper("WaveformPerturbation(**audio_perturb_args)"),
                },
                **returnn_args,
            },
            feature_args=feature_args,
            # fresh dict per experiment so no entry shares a mutable lr config with another one
            lr_args={
                "peak_lr": 4e-4,
                "start_lr": 1.325e-05,
                "end_lr": 1e-5,
                "increase_epochs": 180,
                "decrease_epochs": 180,
                "final_epochs": 0,
            },
            report_args={
                "architecture": "conf-wei",
                "lr": "wei_peak_4e-4_e450_cycle360",
                "specaug": "wei_adapt_80dim",
                # NOTE(review): reported as "True" although feature_args above sets no wave_norm - confirm
                "wave_norm": "True",
                name: label,
            },
        )

    nn_args, report_args_collection = get_nn_args_baseline(
        nn_base_args=nn_base_args,
        num_epochs=450,
        prefix="conformer_bs5k_audio_perturbation_",
    )
    run_nn_args(nn_args, report_args_collection, "report_scf_audio_perturbation.csv", dev_corpora)
def run_scf_watch_mem_test():
    """
    Set up a single ScfNetwork conformer experiment with ``watch_memory`` switched on.

    The experiment is registered under the key ``scf_watch_mem_test``, configured through
    ``get_nn_args_baseline`` for 450 epochs and handed to ``run_nn_args`` together with the
    report name ``report_watch_mem_test.csv``.
    """
    gs.ALIAS_AND_OUTPUT_SUBDIR = "experiments/switchboard/ctc/feat/"

    datasets, loss_corpus, loss_segments, loss_lexicon, eval_corpora = get_datasets()

    # conformer specific options come first, the shared options are merged in afterwards
    experiment_returnn_args = {"conformer_type": "wei", "specaug_old": {"max_feature": 8}}
    experiment_returnn_args.update(
        {
            "batch_size": 5000,
            "rasr_binary_path": RASR_BINARY_PATH,
            "rasr_loss_corpus_path": loss_corpus,
            "rasr_loss_corpus_segments": loss_segments,
            "rasr_loss_lexicon_path": loss_lexicon,
            "datasets": datasets,
            "extra_args": {"watch_memory": True},
        }
    )

    scf_features = {
        "class": "ScfNetwork",
        "size_tf": 256 // 2,
        "stride_tf": 10 // 2,
    }
    lr_schedule = {
        "peak_lr": 4e-4,
        "start_lr": 1.325e-05,
        "end_lr": 1e-5,
        "increase_epochs": 180,
        "decrease_epochs": 180,
        "final_epochs": 0,
    }
    report_columns = {
        "architecture": "conf-wei",
        "lr": "wei_peak_4e-4_e450_cycle360",
        "specaug": "wei_adapt_80dim",
        "wave_norm": "True",
    }

    experiments = {
        "scf_watch_mem_test": dict(
            returnn_args=experiment_returnn_args,
            feature_args=scf_features,
            lr_args=lr_schedule,
            report_args=report_columns,
        ),
    }
    nn_args, report_args_collection = get_nn_args_baseline(
        nn_base_args=experiments,
        num_epochs=450,
        prefix="conformer_bs5k_",
    )
    run_nn_args(nn_args, report_args_collection, "report_watch_mem_test.csv", eval_corpora)
def run_mel_watch_mem_test():
    """
    Set up a single LogMelNetwork conformer experiment with ``watch_memory`` switched on.

    The experiment is registered under the key ``lgm80__watch_mem_test``, configured through
    ``get_nn_args_baseline`` for 450 epochs and handed to ``run_nn_args`` together with the
    report name ``report_mel_watch_mem_test.csv``.
    """
    gs.ALIAS_AND_OUTPUT_SUBDIR = "experiments/switchboard/ctc/feat/"
    (
        returnn_datasets,
        rasr_loss_corpus_path,
        rasr_loss_corpus_segments,
        rasr_loss_lexicon_path,
        dev_corpora,
    ) = get_datasets()
    returnn_args = {
        "batch_size": 5000,
        "rasr_binary_path": RASR_BINARY_PATH,
        "rasr_loss_corpus_path": rasr_loss_corpus_path,
        "rasr_loss_corpus_segments": rasr_loss_corpus_segments,
        "rasr_loss_lexicon_path": rasr_loss_lexicon_path,
        "datasets": returnn_datasets,
        "extra_args": {
            "watch_memory": True,
        },
    }
    feature_args = {"class": "LogMelNetwork", "wave_norm": True, "frame_size": 200, "frame_shift": 80, "fft_size": 256}
    nn_args, report_args_collection = get_nn_args_baseline(
        nn_base_args={
            # NOTE(review): the double underscore looks like a typo; it is kept because the key
            # names the experiment - confirm before renaming
            "lgm80__watch_mem_test": dict(
                returnn_args={"conformer_type": "wei", "specaug_old": {"max_feature": 8}, **returnn_args},
                feature_args=feature_args,
                lr_args={
                    "peak_lr": 4e-4,
                    "start_lr": 1.325e-05,
                    "end_lr": 1e-5,
                    "increase_epochs": 180,
                    "decrease_epochs": 180,
                    "final_epochs": 0,
                },
                report_args={
                    "architecture": "conf-wei",
                    "lr": "wei_peak_4e-4_e450_cycle360",
                    "specaug": "wei_adapt_80dim",
                    "wave_norm": "True",
                },
            ),
        },
        num_epochs=450,
        prefix="conformer_bs5k_",
    )
    # report name carries the ".csv" extension like the other report names passed to run_nn_args
    run_nn_args(nn_args, report_args_collection, "report_mel_watch_mem_test.csv", dev_corpora)
def py():
    """
    Entry point used when this file is passed to the sis manager (takes the role of ``main``).

    Runs the experiment setups listed below one after another, in this order.
    """
    experiment_setups = (
        run_mel_baseline,
        run_scf_baseline,
        run_mel_audio_perturbation,
        run_scf_audio_perturbation,
    )
    for setup in experiment_setups:
        setup()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment