Skip to content

Instantly share code, notes, and snippets.

@cwbeitel
Created March 13, 2019 14:54
Show Gist options
  • Save cwbeitel/00f0bbc5acf5a275d5b231aeb49fba1f to your computer and use it in GitHub Desktop.
Save cwbeitel/00f0bbc5acf5a275d5b231aeb49fba1f to your computer and use it in GitHub Desktop.
{
"activation_dtype": "float32",
"add_relative_to_values": false,
"attention_dropout": 0.1,
"attention_dropout_broadcast_dims": "",
"attention_key_channels": 0,
"attention_value_channels": 0,
"attention_variables_3d": false,
"batch_shuffle_size": 512,
"batch_size": 4,
"causal_decoder_self_attention": true,
"clip_grad_norm": 0.0,
"compress_steps": 0,
"conv_first_kernel": 3,
"daisy_chain_variables": true,
"data_dir": "/tmp",
"dropout": 0.2,
"eval_drop_long_sequences": false,
"eval_freq_in_steps": 1000,
"eval_run_autoregressive": false,
"eval_steps": 100,
"eval_timeout_mins": 240,
"factored_logits": false,
"ffn_layer": "dense_relu_dense",
"filter_size": 512,
"force_full_predict": false,
"grad_noise_scale": 0.0,
"heads_share_relative_embedding": false,
"hidden_size": 128,
"initializer": "uniform_unit_scaling",
"initializer_gain": 1.0,
"kernel_height": 3,
"kernel_width": 1,
"label_smoothing": 0.1,
"layer_postprocess_sequence": "da",
"layer_prepostprocess_dropout": 0.1,
"layer_prepostprocess_dropout_broadcast_dims": "",
"layer_preprocess_sequence": "n",
"learning_rate": 0.2,
"learning_rate_constant": 2.0,
"learning_rate_cosine_cycle_steps": 250000,
"learning_rate_decay_rate": 1.0,
"learning_rate_decay_scheme": "noam",
"learning_rate_decay_staircase": false,
"learning_rate_decay_steps": 5000,
"learning_rate_minimum": null,
"learning_rate_schedule": "constant*linear_warmup*rsqrt_decay*rsqrt_hidden_size",
"learning_rate_warmup_steps": 8000,
"length_bucket_step": 1.1,
"max_input_seq_length": 0,
"max_length": 256,
"max_relative_position": 0,
"max_target_seq_length": 0,
"min_length": 0,
"min_length_bucket": 8,
"mixed_precision_optimizer_init_loss_scale": 32768,
"mixed_precision_optimizer_loss_scaler": "exponential",
"mlperf_mode": false,
"model_dir": "gs://some-bucket-models/experiments/j0227-0154-add2/output",
"moe_hidden_sizes": "2048",
"moe_k": 2,
"moe_loss_coef": 0.001,
"moe_num_experts": 16,
"moe_overhead_eval": 2.0,
"moe_overhead_train": 1.0,
"multiply_embedding_mode": "sqrt_depth",
"multiproblem_fixed_train_length": -1,
"multiproblem_label_weight": 0.5,
"multiproblem_max_input_length": -1,
"multiproblem_max_target_length": -1,
"multiproblem_mixing_schedule": "constant",
"multiproblem_per_task_threshold": "",
"multiproblem_reweight_label_loss": false,
"multiproblem_schedule_max_examples": 10000000.0,
"multiproblem_schedule_threshold": 0.5,
"multiproblem_target_eval_only": false,
"multiproblem_vocab_size": -1,
"nbr_decoder_problems": 1,
"no_data_parallelism": false,
"norm_epsilon": 1e-06,
"norm_type": "layer",
"num_decoder_layers": 0,
"num_encoder_layers": 0,
"num_heads": 4,
"num_hidden_layers": 2,
"optimizer": "adam",
"optimizer_adafactor_beta1": 0.0,
"optimizer_adafactor_beta2": 0.999,
"optimizer_adafactor_clipping_threshold": 1.0,
"optimizer_adafactor_decay_type": "pow",
"optimizer_adafactor_factored": true,
"optimizer_adafactor_memory_exponent": 0.8,
"optimizer_adafactor_multiply_by_parameter_scale": true,
"optimizer_adam_beta1": 0.9,
"optimizer_adam_beta2": 0.997,
"optimizer_adam_epsilon": 1e-09,
"optimizer_momentum_momentum": 0.9,
"optimizer_momentum_nesterov": false,
"optimizer_multistep_accumulate_steps": null,
"optimizer_zero_grads": false,
"overload_eval_metric_name": "",
"pack_dataset": false,
"pad_batch": false,
"parameter_attention_key_channels": 0,
"parameter_attention_value_channels": 0,
"pos": "timing",
"prepend_mode": "none",
"pretrained_model_dir": "",
"proximity_bias": false,
"relu_dropout": 0.1,
"relu_dropout_broadcast_dims": "",
"sampling_method": "argmax",
"sampling_temp": 1.0,
"schedule": "train",
"scheduled_sampling_gold_mixin_prob": 0.5,
"scheduled_sampling_prob": 0.0,
"scheduled_sampling_warmup_steps": 50000,
"self_attention_type": "dot_product",
"shared_embedding": false,
"shared_embedding_and_softmax_weights": true,
"split_targets_chunk_length": 0,
"split_targets_max_chunks": 100,
"split_to_length": 0,
"std_server_protocol": "grpc",
"summarize_grads": false,
"summarize_vars": false,
"symbol_dropout": 0.0,
"symbol_modality_num_shards": 16,
"tpu_enable_host_call": false,
"train_steps": 100,
"unidirectional_encoder": false,
"use_custom_ops": true,
"use_fixed_batch_size": false,
"use_pad_remover": true,
"use_target_space_embedding": true,
"video_num_input_frames": 1,
"video_num_target_frames": 1,
"vocab_divisor": 1,
"warm_start_from": null,
"warm_start_from_second": "",
"weight_decay": 0.0,
"weight_dtype": "float32",
"weight_noise": 0.0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment