-
-
Save cwbeitel/00f0bbc5acf5a275d5b231aeb49fba1f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"activation_dtype": "float32", | |
"add_relative_to_values": false, | |
"attention_dropout": 0.1, | |
"attention_dropout_broadcast_dims": "", | |
"attention_key_channels": 0, | |
"attention_value_channels": 0, | |
"attention_variables_3d": false, | |
"batch_shuffle_size": 512, | |
"batch_size": 4, | |
"causal_decoder_self_attention": true, | |
"clip_grad_norm": 0.0, | |
"compress_steps": 0, | |
"conv_first_kernel": 3, | |
"daisy_chain_variables": true, | |
"data_dir": "/tmp", | |
"dropout": 0.2, | |
"eval_drop_long_sequences": false, | |
"eval_freq_in_steps": 1000, | |
"eval_run_autoregressive": false, | |
"eval_steps": 100, | |
"eval_timeout_mins": 240, | |
"factored_logits": false, | |
"ffn_layer": "dense_relu_dense", | |
"filter_size": 512, | |
"force_full_predict": false, | |
"grad_noise_scale": 0.0, | |
"heads_share_relative_embedding": false, | |
"hidden_size": 128, | |
"initializer": "uniform_unit_scaling", | |
"initializer_gain": 1.0, | |
"kernel_height": 3, | |
"kernel_width": 1, | |
"label_smoothing": 0.1, | |
"layer_postprocess_sequence": "da", | |
"layer_prepostprocess_dropout": 0.1, | |
"layer_prepostprocess_dropout_broadcast_dims": "", | |
"layer_preprocess_sequence": "n", | |
"learning_rate": 0.2, | |
"learning_rate_constant": 2.0, | |
"learning_rate_cosine_cycle_steps": 250000, | |
"learning_rate_decay_rate": 1.0, | |
"learning_rate_decay_scheme": "noam", | |
"learning_rate_decay_staircase": false, | |
"learning_rate_decay_steps": 5000, | |
"learning_rate_minimum": null, | |
"learning_rate_schedule": "constant*linear_warmup*rsqrt_decay*rsqrt_hidden_size", | |
"learning_rate_warmup_steps": 8000, | |
"length_bucket_step": 1.1, | |
"max_input_seq_length": 0, | |
"max_length": 256, | |
"max_relative_position": 0, | |
"max_target_seq_length": 0, | |
"min_length": 0, | |
"min_length_bucket": 8, | |
"mixed_precision_optimizer_init_loss_scale": 32768, | |
"mixed_precision_optimizer_loss_scaler": "exponential", | |
"mlperf_mode": false, | |
"model_dir": "gs://some-bucket-models/experiments/j0227-0154-add2/output", | |
"moe_hidden_sizes": "2048", | |
"moe_k": 2, | |
"moe_loss_coef": 0.001, | |
"moe_num_experts": 16, | |
"moe_overhead_eval": 2.0, | |
"moe_overhead_train": 1.0, | |
"multiply_embedding_mode": "sqrt_depth", | |
"multiproblem_fixed_train_length": -1, | |
"multiproblem_label_weight": 0.5, | |
"multiproblem_max_input_length": -1, | |
"multiproblem_max_target_length": -1, | |
"multiproblem_mixing_schedule": "constant", | |
"multiproblem_per_task_threshold": "", | |
"multiproblem_reweight_label_loss": false, | |
"multiproblem_schedule_max_examples": 10000000.0, | |
"multiproblem_schedule_threshold": 0.5, | |
"multiproblem_target_eval_only": false, | |
"multiproblem_vocab_size": -1, | |
"nbr_decoder_problems": 1, | |
"no_data_parallelism": false, | |
"norm_epsilon": 1e-06, | |
"norm_type": "layer", | |
"num_decoder_layers": 0, | |
"num_encoder_layers": 0, | |
"num_heads": 4, | |
"num_hidden_layers": 2, | |
"optimizer": "adam", | |
"optimizer_adafactor_beta1": 0.0, | |
"optimizer_adafactor_beta2": 0.999, | |
"optimizer_adafactor_clipping_threshold": 1.0, | |
"optimizer_adafactor_decay_type": "pow", | |
"optimizer_adafactor_factored": true, | |
"optimizer_adafactor_memory_exponent": 0.8, | |
"optimizer_adafactor_multiply_by_parameter_scale": true, | |
"optimizer_adam_beta1": 0.9, | |
"optimizer_adam_beta2": 0.997, | |
"optimizer_adam_epsilon": 1e-09, | |
"optimizer_momentum_momentum": 0.9, | |
"optimizer_momentum_nesterov": false, | |
"optimizer_multistep_accumulate_steps": null, | |
"optimizer_zero_grads": false, | |
"overload_eval_metric_name": "", | |
"pack_dataset": false, | |
"pad_batch": false, | |
"parameter_attention_key_channels": 0, | |
"parameter_attention_value_channels": 0, | |
"pos": "timing", | |
"prepend_mode": "none", | |
"pretrained_model_dir": "", | |
"proximity_bias": false, | |
"relu_dropout": 0.1, | |
"relu_dropout_broadcast_dims": "", | |
"sampling_method": "argmax", | |
"sampling_temp": 1.0, | |
"schedule": "train", | |
"scheduled_sampling_gold_mixin_prob": 0.5, | |
"scheduled_sampling_prob": 0.0, | |
"scheduled_sampling_warmup_steps": 50000, | |
"self_attention_type": "dot_product", | |
"shared_embedding": false, | |
"shared_embedding_and_softmax_weights": true, | |
"split_targets_chunk_length": 0, | |
"split_targets_max_chunks": 100, | |
"split_to_length": 0, | |
"std_server_protocol": "grpc", | |
"summarize_grads": false, | |
"summarize_vars": false, | |
"symbol_dropout": 0.0, | |
"symbol_modality_num_shards": 16, | |
"tpu_enable_host_call": false, | |
"train_steps": 100, | |
"unidirectional_encoder": false, | |
"use_custom_ops": true, | |
"use_fixed_batch_size": false, | |
"use_pad_remover": true, | |
"use_target_space_embedding": true, | |
"video_num_input_frames": 1, | |
"video_num_target_frames": 1, | |
"vocab_divisor": 1, | |
"warm_start_from": null, | |
"warm_start_from_second": "", | |
"weight_decay": 0.0, | |
"weight_dtype": "float32", | |
"weight_noise": 0.0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment