import t5.models.mesh_transformer
import t5.data.sentencepiece_vocabulary
import mesh_tensorflow.optimize
import mesh_tensorflow.transformer.dataset
import mesh_tensorflow.transformer.learning_rate_schedules
import mesh_tensorflow.transformer.t2t_vocabulary
import mesh_tensorflow.transformer.transformer_layers
import mesh_tensorflow.transformer.utils

# Macros:
# ==============================================================================
teacher_d_ff = 65536
teacher_d_kv = 128
teacher_d_model = 1024
teacher_num_heads = 128
teacher_num_layers = 24
teacher_init_checkpoint = "gs://t5-data/pretrained_models/11B/model.ckpt-1000000"
student_d_ff = 2048
student_d_kv = 64
student_d_model = 512
student_num_heads = 8
student_num_layers = 6
student_init_checkpoint = "gs://t5-data/pretrained_models/small/model.ckpt-1000000"
dropout_rate = 0.1
inputs_length = 512
targets_length = 512
mean_noise_span_length = 3.0
MIXTURE_NAME = 'all_mix'
noise_density = 0.15
tokens_per_batch = 1048576
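# (Context: the teacher dimensions above, d_model 1024, d_ff 65536, 128 heads,
# 24 layers, match the public T5-11B checkpoint, and the student dimensions,
# d_model 512, d_ff 2048, 8 heads, 6 layers, match T5-small, consistent with
# the two init checkpoints.)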
# Parameters for AdafactorOptimizer:
# ==============================================================================
AdafactorOptimizer.beta1 = 0.0
AdafactorOptimizer.clipping_threshold = 1.0
AdafactorOptimizer.decay_rate = None
AdafactorOptimizer.epsilon1 = 1e-30
AdafactorOptimizer.epsilon2 = 0.001
AdafactorOptimizer.factored = True
AdafactorOptimizer.min_dim_size_to_factor = 128
AdafactorOptimizer.multiply_by_parameter_scale = True
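# With beta1 = 0.0 Adafactor keeps no momentum accumulator, and factored = True
# replaces the full second-moment matrix with row and column statistics for any
# weight whose dimensions are both at least min_dim_size_to_factor, which keeps
# optimizer state cheap at the 11B scale. decay_rate = None defers to the
# library's step-dependent default (roughly 1 - step^-0.8 in
# mesh_tensorflow.optimize, if the usual default applies).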
# Parameters for Bitransformer:
# ==============================================================================
Bitransformer.shared_embedding = True
# Parameters for denoise:
# ==============================================================================
denoise.inputs_fn = @preprocessors.noise_span_to_unique_sentinel
denoise.noise_density = %noise_density
denoise.noise_mask_fn = @preprocessors.random_spans_noise_mask
denoise.targets_fn = @preprocessors.nonnoise_span_to_unique_sentinel
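# This is the T5 span-corruption objective: random_spans_noise_mask marks
# about 15% of tokens (noise_density) in spans of mean length 3
# (mean_noise_span_length); noise_span_to_unique_sentinel swaps each noisy
# span in the inputs for a distinct sentinel token, and
# nonnoise_span_to_unique_sentinel assembles the targets from the dropped
# spans, delimited by the same sentinels.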
# Parameters for teacher/decoder/DenseReluDense:
# ==============================================================================
teacher/decoder/DenseReluDense.dropout_rate = %dropout_rate
teacher/decoder/DenseReluDense.hidden_size = %teacher_d_ff
# Parameters for teacher/encoder/DenseReluDense:
# ==============================================================================
teacher/encoder/DenseReluDense.dropout_rate = %dropout_rate
teacher/encoder/DenseReluDense.hidden_size = %teacher_d_ff
# Parameters for teacher/decoder/EncDecAttention:
# ==============================================================================
# None.
# Parameters for get_sentencepiece_model_path:
# ==============================================================================
get_sentencepiece_model_path.mixture_or_task_name = %MIXTURE_NAME
# Parameters for get_variable_dtype:
# ==============================================================================
get_variable_dtype.activation_dtype = 'bfloat16'
# Parameters for teacher/decoder/LayerStack:
# ==============================================================================
teacher/decoder/LayerStack.dropout_rate = %dropout_rate
teacher/decoder/LayerStack.norm_epsilon = 1e-06
# Parameters for teacher/encoder/LayerStack:
# ==============================================================================
teacher/encoder/LayerStack.dropout_rate = %dropout_rate
teacher/encoder/LayerStack.norm_epsilon = 1e-06
# Parameters for learning_rate_schedule_noam:
# ==============================================================================
learning_rate_schedule_noam.linear_decay_fraction = 0.1
learning_rate_schedule_noam.multiplier = 1.0
learning_rate_schedule_noam.offset = 0
learning_rate_schedule_noam.warmup_steps = 10000
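# The noam schedule here is roughly multiplier * rsqrt(max(step, warmup_steps)):
# constant at 1/sqrt(10000) = 0.01 for the first 10k steps, then inverse-sqrt
# decay, with a final linear decay over the last linear_decay_fraction of
# train_steps (at the 1e9 train_steps set below, that tail is effectively
# never reached).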
# Parameters for make_bitransformer:
# ==============================================================================
make_bitransformer.decoder_name = 'decoder'
make_bitransformer.encoder_name = 'encoder'
# Parameters for teacher/decoder/make_layer_stack:
# ==============================================================================
teacher/decoder/make_layer_stack.block_scope = True
teacher/decoder/make_layer_stack.layers = \
    [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
     @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
     @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
teacher/decoder/make_layer_stack.num_layers = %teacher_num_layers
# Parameters for teacher/encoder/make_layer_stack:
# ==============================================================================
teacher/encoder/make_layer_stack.block_scope = True
teacher/encoder/make_layer_stack.layers = \
    [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
     @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
teacher/encoder/make_layer_stack.num_layers = %teacher_num_layers
# Parameters for mesh_train_dataset_fn:
# ==============================================================================
mesh_train_dataset_fn.mixture_or_task_name = %MIXTURE_NAME
mesh_train_dataset_fn.use_cached = False
# Parameters for noise_span_to_unique_sentinel:
# ==============================================================================
# None.
# Parameters for nonnoise_span_to_unique_sentinel:
# ==============================================================================
# None.
# Parameters for pack_dataset:
# ==============================================================================
# None.
# Parameters for pack_or_pad:
# ==============================================================================
# None.
# Parameters for random_spans_helper:
# ==============================================================================
random_spans_helper.extra_tokens_per_span_inputs = 1
random_spans_helper.extra_tokens_per_span_targets = 1
random_spans_helper.inputs_length = %inputs_length
random_spans_helper.mean_noise_span_length = %mean_noise_span_length
random_spans_helper.noise_density = %noise_density
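# Worked example, following the T5 recipe: random_spans_helper finds the raw
# tokens_length whose corrupted inputs just fit inputs_length. For
# inputs_length = 512, noise_density = 0.15 and mean span 3.0, that raw length
# works out to 568: round(568 * 0.15) = 85 noise tokens in round(85 / 3) = 28
# spans, giving inputs = (568 - 85) + 28 sentinels + 1 EOS = 512 and
# targets = 85 + 28 sentinels + 1 EOS = 114.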
# Parameters for teacher/decoder/SelfAttention:
# ==============================================================================
teacher/decoder/SelfAttention.attention_kwargs = None
teacher/decoder/SelfAttention.dropout_rate = %dropout_rate
teacher/decoder/SelfAttention.key_value_size = %teacher_d_kv
teacher/decoder/SelfAttention.num_heads = %teacher_num_heads
teacher/decoder/SelfAttention.num_memory_heads = 0
teacher/decoder/SelfAttention.relative_attention_num_buckets = 32
teacher/decoder/SelfAttention.relative_attention_type = 'bias_shared'
teacher/decoder/SelfAttention.shared_kv = False
# Parameters for teacher/encoder/SelfAttention:
# ==============================================================================
teacher/encoder/SelfAttention.attention_kwargs = None
teacher/encoder/SelfAttention.dropout_rate = %dropout_rate
teacher/encoder/SelfAttention.key_value_size = %teacher_d_kv
teacher/encoder/SelfAttention.num_heads = %teacher_num_heads
teacher/encoder/SelfAttention.num_memory_heads = 0
teacher/encoder/SelfAttention.relative_attention_num_buckets = 32
teacher/encoder/SelfAttention.relative_attention_type = 'bias_shared'
teacher/encoder/SelfAttention.shared_kv = False
# Parameters for teacher/decoder/Unitransformer:
# ==============================================================================
teacher/decoder/Unitransformer.d_model = %teacher_d_model
teacher/decoder/Unitransformer.input_full_attention = False
teacher/decoder/Unitransformer.label_smoothing = 0.0
teacher/decoder/Unitransformer.loss_fn = None
teacher/decoder/Unitransformer.loss_on_targets_only = False
teacher/decoder/Unitransformer.max_length = 512
teacher/decoder/Unitransformer.positional_embedding = False
teacher/decoder/Unitransformer.shared_embedding_and_softmax_weights = True
teacher/decoder/Unitransformer.vocab_divisor = 128
teacher/decoder/Unitransformer.z_loss = 0.0001
teacher/decoder/Unitransformer.loss_denominator = 233472
# Parameters for teacher/encoder/Unitransformer:
# ==============================================================================
teacher/encoder/Unitransformer.d_model = %teacher_d_model
teacher/encoder/Unitransformer.input_full_attention = False
teacher/encoder/Unitransformer.label_smoothing = 0.0
teacher/encoder/Unitransformer.loss_fn = None
teacher/encoder/Unitransformer.loss_on_targets_only = False
teacher/encoder/Unitransformer.max_length = 512
teacher/encoder/Unitransformer.positional_embedding = False
teacher/encoder/Unitransformer.shared_embedding_and_softmax_weights = True
teacher/encoder/Unitransformer.vocab_divisor = 128
teacher/encoder/Unitransformer.z_loss = 0.0001
# Parameters for unsupervised:
# ==============================================================================
unsupervised.preprocessors = \
    [@preprocessors.select_random_chunk,
     @preprocessors.reduce_concat_tokens,
     @preprocessors.split_tokens,
     @preprocessors.denoise]
# Parameters for student/decoder/DenseReluDense:
# ==============================================================================
student/decoder/DenseReluDense.dropout_rate = %dropout_rate
student/decoder/DenseReluDense.hidden_size = %student_d_ff
# Parameters for student/encoder/DenseReluDense:
# ==============================================================================
student/encoder/DenseReluDense.dropout_rate = %dropout_rate
student/encoder/DenseReluDense.hidden_size = %student_d_ff
# Parameters for student/decoder/EncDecAttention:
# ==============================================================================
# None.
# Parameters for student/decoder/LayerStack:
# ==============================================================================
student/decoder/LayerStack.dropout_rate = %dropout_rate
student/decoder/LayerStack.norm_epsilon = 1e-06
# Parameters for student/encoder/LayerStack:
# ==============================================================================
student/encoder/LayerStack.dropout_rate = %dropout_rate
student/encoder/LayerStack.norm_epsilon = 1e-06
# Parameters for student/decoder/make_layer_stack:
# ==============================================================================
student/decoder/make_layer_stack.block_scope = True
student/decoder/make_layer_stack.layers = \
    [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
     @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
     @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
student/decoder/make_layer_stack.num_layers = %student_num_layers
# Parameters for student/encoder/make_layer_stack:
# ==============================================================================
student/encoder/make_layer_stack.block_scope = True
student/encoder/make_layer_stack.layers = \
    [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
     @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
student/encoder/make_layer_stack.num_layers = %student_num_layers
# Parameters for targets_length/random_spans_helper:
# ==============================================================================
targets_length/random_spans_helper.extra_tokens_per_span_inputs = 1
targets_length/random_spans_helper.extra_tokens_per_span_targets = 1
targets_length/random_spans_helper.inputs_length = %inputs_length
targets_length/random_spans_helper.mean_noise_span_length = %mean_noise_span_length
targets_length/random_spans_helper.noise_density = %noise_density
# Parameters for random_spans_noise_mask:
# ==============================================================================
random_spans_noise_mask.mean_noise_span_length = %mean_noise_span_length
# Parameters for targets_length/random_spans_targets_length:
# ==============================================================================
# None.
# Parameters for random_spans_tokens_length:
# ==============================================================================
# None.
# Parameters for rate_num_examples:
# ==============================================================================
rate_num_examples.maximum = 1000000.0
rate_num_examples.scale = 1.0
rate_num_examples.temperature = 1.0
# Parameters for rate_unsupervised:
# ==============================================================================
rate_unsupervised.value = 710000.0
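# Mixing rates: rate_num_examples samples each task in proportion to its
# example count, capped at 1000000, while rate_unsupervised pins the
# unsupervised task at a fixed 710000, i.e. comparable weight to the largest
# supervised tasks (assuming these helpers behave as in the T5 codebase).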
# Parameters for reduce_concat_tokens:
# ==============================================================================
reduce_concat_tokens.batch_size = 128
reduce_concat_tokens.feature_key = 'targets'
# Parameters for run:
# ==============================================================================
run.autostack = True
run.batch_size = ('tokens_per_batch', %tokens_per_batch)
run.dataset_split = 'train'
run.ensemble_inputs = None
run.eval_checkpoint_step = None
run.eval_dataset_fn = None
run.eval_summary_dir = None
run.export_path = ''
run.iterations_per_loop = 100
run.keep_checkpoint_max = None
run.layout_rules = \
    'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
run.learning_rate_schedule = @learning_rate_schedules.learning_rate_schedule_noam
run.mesh_shape = @mesh_tensorflow.transformer.utils.tpu_mesh_shape()
run.mode = 'train'
run.init_checkpoint = %student_init_checkpoint
run.model_type = 'bi_student_teacher'
run.optimizer = @optimize.AdafactorOptimizer
run.perplexity_eval_steps = 10
run.predict_fn = None
run.save_checkpoints_steps = 2400
run.sequence_length = {'inputs': %inputs_length, 'targets': %targets_length}
run.train_dataset_fn = @t5.models.mesh_transformer.mesh_train_dataset_fn
run.train_steps = 1000000000
run.variable_filter = None
run.vocabulary = @t5.data.sentencepiece_vocabulary.SentencePieceVocabulary()
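# With batch_size keyed on tokens_per_batch, each step consumes 2^20 = 1048576
# tokens, about 2048 packed 512-token sequences. model_type
# 'bi_student_teacher' builds both networks; run.init_checkpoint warm-starts
# the student from T5-small, while the teacher weights are restored separately
# from StudentTeacher.teacher_checkpoint below. train_steps of 1e9 effectively
# means "train until stopped", checkpointing every 2400 steps.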
# Parameters for StudentTeacher:
# ==============================================================================
StudentTeacher.teacher_checkpoint = %teacher_init_checkpoint
StudentTeacher.temperature = 10
StudentTeacher.fraction_soft = 0.6
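# As implemented in the mesh_tensorflow StudentTeacher (modulo details worth
# checking in the source), the objective is
#   loss = fraction_soft * soft_loss + (1 - fraction_soft) * hard_loss,
# where the soft term is a cross-entropy between teacher and student
# distributions softened by temperature = 10 and the hard term is the usual
# cross-entropy against the gold targets; fraction_soft = 0.6 weights the
# teacher's signal slightly above the labels.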
# Parameters for select_random_chunk:
# ==============================================================================
select_random_chunk.feature_key = 'targets'
select_random_chunk.max_length = 65536
# Parameters for student/decoder/SelfAttention:
# ==============================================================================
student/decoder/SelfAttention.attention_kwargs = None
student/decoder/SelfAttention.dropout_rate = %dropout_rate
student/decoder/SelfAttention.key_value_size = %student_d_kv
student/decoder/SelfAttention.num_heads = %student_num_heads
student/decoder/SelfAttention.num_memory_heads = 0
student/decoder/SelfAttention.relative_attention_num_buckets = 32
student/decoder/SelfAttention.relative_attention_type = 'bias_shared'
student/decoder/SelfAttention.shared_kv = False
# Parameters for student/encoder/SelfAttention:
# ==============================================================================
student/encoder/SelfAttention.attention_kwargs = None
student/encoder/SelfAttention.dropout_rate = %dropout_rate
student/encoder/SelfAttention.key_value_size = %student_d_kv
student/encoder/SelfAttention.num_heads = %student_num_heads
student/encoder/SelfAttention.num_memory_heads = 0
student/encoder/SelfAttention.relative_attention_num_buckets = 32
student/encoder/SelfAttention.relative_attention_type = 'bias_shared'
student/encoder/SelfAttention.shared_kv = False
# Parameters for SentencePieceVocabulary:
# ==============================================================================
SentencePieceVocabulary.extra_ids = 100
SentencePieceVocabulary.sentencepiece_model_file = \
    @t5.models.mesh_transformer.get_sentencepiece_model_path()
# Parameters for serialize_num_microbatches:
# ==============================================================================
serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192
# Parameters for split_tokens:
# ==============================================================================
split_tokens.feature_key = 'targets'
split_tokens.max_tokens_per_segment = @preprocessors.random_spans_tokens_length()
split_tokens.min_tokens_per_segment = None
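# max_tokens_per_segment is bound to random_spans_tokens_length(), which
# presumably returns the raw pre-corruption length computed by
# random_spans_helper (568 in the worked example above), so each segment
# denoises to inputs that fill inputs_length = 512 exactly.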
# Parameters for tpu_estimator_model_fn:
# ==============================================================================
tpu_estimator_model_fn.outer_batch_size = 1
tpu_estimator_model_fn.tpu_summaries = False
# Parameters for tpu_mesh_shape:
# ==============================================================================
tpu_mesh_shape.ensemble_parallelism = None
tpu_mesh_shape.model_parallelism = 1
tpu_mesh_shape.tpu_topology = '8x8'
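# An '8x8' TPU topology is 64 chips (128 cores, assuming two cores per chip);
# with model_parallelism = 1 the 'model' mesh dimension has size one, so the
# layout_rules above resolve to pure data parallelism.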
# Parameters for student/decoder/Unitransformer:
# ==============================================================================
student/decoder/Unitransformer.d_model = %student_d_model
student/decoder/Unitransformer.ensemble = None
student/decoder/Unitransformer.input_full_attention = False