Created
February 2, 2021 07:10
-
-
Save pritamdamania87/7141eadd162ba672b465a7920e62508e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using world size: 1, data-parallel-size: 1, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 | |
using torch.float16 for parameters ... | |
------------------------ arguments ------------------------ | |
adam_beta1 ...................................... 0.9 | |
adam_beta2 ...................................... 0.999 | |
adam_eps ........................................ 1e-08 | |
adlr_autoresume ................................. False | |
adlr_autoresume_interval ........................ 1000 | |
apply_query_key_layer_scaling ................... True | |
apply_residual_connection_post_layernorm ........ False | |
attention_dropout ............................... 0.1 | |
attention_softmax_in_fp32 ....................... False | |
bert_load ....................................... None | |
bias_dropout_fusion ............................. True | |
bias_gelu_fusion ................................ True | |
block_data_path ................................. None | |
checkpoint_activations .......................... True | |
checkpoint_num_layers ........................... 1 | |
clip_grad ....................................... 1.0 | |
consumed_train_samples .......................... 0 | |
consumed_valid_samples .......................... 0 | |
data_impl ....................................... infer | |
data_parallel_size .............................. 1 | |
data_path ....................................... ['my-bert_text_sentence'] | |
DDP_impl ........................................ local | |
distribute_checkpointed_activations ............. False | |
distributed_backend ............................. nccl | |
eod_mask_loss ................................... False | |
eval_interval ................................... 100 | |
eval_iters ...................................... 10 | |
exit_duration_in_mins ........................... None | |
exit_interval ................................... None | |
faiss_use_gpu ................................... False | |
finetune ........................................ False | |
fp16 ............................................ True | |
fp16_lm_cross_entropy ........................... False | |
fp32_allreduce .................................. False | |
fp32_residual_connection ........................ False | |
global_batch_size ............................... 8 | |
hidden_dropout .................................. 0.1 | |
hidden_size ..................................... 1024 | |
hysteresis ...................................... 2 | |
ict_head_size ................................... None | |
ict_load ........................................ None | |
indexer_batch_size .............................. 128 | |
indexer_log_interval ............................ 1000 | |
init_method_std ................................. 0.02 | |
initial_loss_scale .............................. 4294967296 | |
layernorm_epsilon ............................... 1e-05 | |
lazy_mpu_init ................................... None | |
load ............................................ checkpoints/bert_345m | |
local_rank ...................................... None | |
log_interval .................................... 10 | |
loss_scale ...................................... None | |
loss_scale_window ............................... 1000 | |
lr .............................................. 0.0001 | |
lr_decay_iters .................................. 990000 | |
lr_decay_samples ................................ None | |
lr_decay_style .................................. linear | |
lr_warmup_fraction .............................. 0.01 | |
lr_warmup_iters ................................. 0 | |
lr_warmup_samples ............................... 0 | |
make_vocab_size_divisible_by .................... 128 | |
mask_prob ....................................... 0.15 | |
max_position_embeddings ......................... 512 | |
merge_file ...................................... None | |
micro_batch_size ................................ 4 | |
min_loss_scale .................................. 1.0 | |
min_lr .......................................... 1e-05 | |
mmap_warmup ..................................... False | |
no_load_optim ................................... False | |
no_load_rng ..................................... False | |
no_save_optim ................................... False | |
no_save_rng ..................................... False | |
num_attention_heads ............................. 16 | |
num_layers ...................................... 24 | |
num_workers ..................................... 2 | |
onnx_safe ....................................... None | |
openai_gelu ..................................... False | |
override_lr_scheduler ........................... False | |
params_dtype .................................... torch.float16 | |
pipeline_model_parallel_size .................... 1 | |
query_in_block_prob ............................. 0.1 | |
rampup_batch_size ............................... None | |
rank ............................................ 0 | |
report_topk_accuracies .......................... [] | |
reset_attention_mask ............................ False | |
reset_position_ids .............................. False | |
save ............................................ checkpoints/bert_345m | |
save_interval ................................... 500 | |
scaled_masked_softmax_fusion .................... False | |
scaled_upper_triang_masked_softmax_fusion ....... False | |
seed ............................................ 1234 | |
seq_length ...................................... 512 | |
short_seq_prob .................................. 0.1 | |
split ........................................... 34,33,33 | |
tensor_model_parallel_size ...................... 1 | |
tensorboard_dir ................................. None | |
titles_data_path ................................ None | |
tokenizer_type .................................. BertWordPieceLowerCase | |
train_iters ..................................... 2000000 | |
train_samples ................................... None | |
use_checkpoint_lr_scheduler ..................... False | |
use_cpu_initialization .......................... False | |
use_one_sent_docs ............................... False | |
vocab_file ...................................... bert-vocab.txt | |
weight_decay .................................... 0.01 | |
world_size ...................................... 1 | |
-------------------- end of arguments --------------------- | |
setting number of micro-batches to constant 2 | |
> building BertWordPieceLowerCase tokenizer ... | |
> padded vocab (size: 13) with 115 dummy tokens (new size: 128) | |
> initializing torch distributed ... | |
> initializing tensor model parallel with size 1 | |
> initializing pipeline model parallel with size 1 | |
> setting random seeds to 1234 ... | |
> initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 | |
time to initialize megatron (seconds): 10.515 | |
[after megatron is initialized] datetime: 2021-02-01 23:08:58 | |
building BERT model ... | |
> number of parameters on (tensor, pipeline) model parallel rank (0, 0): 305072258 | |
> learning rate decay style: linear | |
WARNING: could not find the metadata file checkpoints/bert_345m/latest_checkpointed_iteration.txt | |
will not load any checkpoints and will start from random | |
time (ms) | load checkpoint: 0.10 | |
[after model, optimizer, and learning rate scheduler are built] datetime: 2021-02-01 23:08:58 | |
> building train, validation, and test datasets ... | |
> datasets target sizes (minimum size): | |
train: 16000000 | |
validation: 1600080 | |
test: 80 | |
> building train, validation, and test datasets for BERT ... | |
> building dataset index ... | |
reading sizes... | |
reading pointers... | |
reading document index... | |
creating numpy buffer of mmap... | |
creating memory view of numpy buffer... | |
> finished creating indexed dataset in 0.000763 seconds | |
> indexed dataset stats: | |
number of documents: 2 | |
number of sentences: 2 | |
> dataset split: | |
train: | |
document indices in [0, 0) total of 0 documents | |
sentence indices in [0, 0) total of 0 sentences | |
validation: | |
document indices in [0, 1) total of 1 documents | |
sentence indices in [0, 1) total of 1 sentences | |
test: | |
document indices in [1, 2) total of 1 documents | |
sentence indices in [1, 2) total of 1 sentences | |
> WARNING: could not find index map file my-bert_text_sentence_valid_indexmap_1600080mns_512msl_0.10ssp_1234s.npy, building the indices on rank 0 ... | |
> building sapmles index mapping for valid ... | |
using uint32 for data mapping... | |
using: | |
number of documents: 1 | |
sentences range: [0, 1) | |
total number of sentences: 1 | |
number of epochs: 2147483646 | |
maximum number of samples: 1600080 | |
maximum sequence length: 509 | |
short sequence probability: 0.1 | |
short sequence ration (1/prob): 10 | |
seed: 1234 | |
number of empty documents: 0 | |
number of documents with one sentence: 1 | |
number of documents with long sentences: 0 | |
will create mapping for 0 samples | |
> done building sapmles index maping | |
> saved the index mapping in my-bert_text_sentence_valid_indexmap_1600080mns_512msl_0.10ssp_1234s.npy | |
> elasped time to build and save samples mapping (seconds): 14.715124 | |
> loading indexed mapping from my-bert_text_sentence_valid_indexmap_1600080mns_512msl_0.10ssp_1234s.npy | |
loaded indexed file in 0.001 seconds | |
total number of samples: 0 | |
> WARNING: could not find index map file my-bert_text_sentence_test_indexmap_80mns_512msl_0.10ssp_1234s.npy, building the indices on rank 0 ... | |
> building sapmles index mapping for test ... | |
using uint32 for data mapping... | |
using: | |
number of documents: 1 | |
sentences range: [1, 2) | |
total number of sentences: 1 | |
number of epochs: 2147483646 | |
maximum number of samples: 80 | |
maximum sequence length: 509 | |
short sequence probability: 0.1 | |
short sequence ration (1/prob): 10 | |
seed: 1234 | |
number of empty documents: 0 | |
number of documents with one sentence: 1 | |
number of documents with long sentences: 0 | |
will create mapping for 0 samples | |
> done building sapmles index maping | |
> saved the index mapping in my-bert_text_sentence_test_indexmap_80mns_512msl_0.10ssp_1234s.npy | |
> elasped time to build and save samples mapping (seconds): 14.168460 | |
> loading indexed mapping from my-bert_text_sentence_test_indexmap_80mns_512msl_0.10ssp_1234s.npy | |
loaded indexed file in 0.001 seconds | |
total number of samples: 0 | |
> finished creating BERT datasets ... | |
Traceback (most recent call last): | |
File "pretrain_bert.py", line 155, in <module> | |
args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) | |
File "/data/users/pritam/Megatron-LM/megatron/training.py", line 116, in pretrain | |
train_valid_test_dataset_provider) | |
File "/data/users/pritam/Megatron-LM/megatron/training.py", line 1000, in build_train_valid_test_data_iterators | |
valid_ds, args.consumed_valid_samples) | |
File "/data/users/pritam/Megatron-LM/megatron/data/data_loaders.py", line 38, in build_pretraining_data_loader | |
data_parallel_size=mpu.get_data_parallel_world_size()) | |
File "/data/users/pritam/Megatron-LM/megatron/data/data_loaders.py", line 62, in __init__ | |
'no sample to consume: {}'.format(self.total_samples) | |
AssertionError: no sample to consume: 0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment