| Creating hostfile... | |
| Hostfile created. | |
| [2025-01-09 09:54:53,173] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
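Lines like the one above (and repeated once per rank later in this log) come from DeepSpeed's per-process accelerator auto-detection. A minimal sketch of querying the same machinery yourself, assuming a DeepSpeed install like the 0.12.4 used in this run:

```python
# Sketch: query DeepSpeed's auto-detected accelerator, the same mechanism
# behind the "Setting ds_accelerator to cuda (auto detect)" log lines.
from deepspeed.accelerator import get_accelerator

acc = get_accelerator()
print(acc.device_name())   # "cuda" on this cluster
print(acc.device_count())  # GPUs visible to this process
```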
| NeoXArgs.from_ymls() ['./configs/6-7B.yml', './configs/local_setup_wandb_modified_with_slurm.yml'] | |
| -------------------- arguments -------------------- | |
| attention_config ................ ['flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash']updated | |
| batch_size ...................... 1...........................updated | |
| checkpoint_activations .......... True........................updated | |
| checkpoint_factor ............... 10000.......................updated | |
| config_files .................... {'6-7B.yml': '# GPT-2 pretraining setup\n{\n # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages\n # across the node boundaries )\n "pipe_parallel_size": 1,\n "model_parallel_size": 2,\n\n # model settings\n "num_layers": 32,\n "hidden_size": 4096,\n "num_attention_heads": 32,\n "seq_length": 2048,\n "max_position_embeddings": 2048,\n "norm": "layernorm",\n "pos_emb": "rotary",\n "no_weight_tying": true,\n "gpt_j_residual": false,\n "output_layer_parallelism": "column",\n\n # these should provide some speedup but takes a while to build, set to true if desired\n "scaled_upper_triang_masked_softmax_fusion": false,\n "bias_gelu_fusion": false,\n "rope_fusion": false,\n "layernorm_fusion": false,\n\n # init methods\n "init_method": "small_init",\n "output_layer_init_method": "wang_init",\n\n # optimizer settings\n "optimizer": {\n "type": "Adam",\n "params": {\n "lr": 0.00012,\n "betas": [0.9, 0.95],\n "eps": 1.0e-8,\n }\n },\n\n # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training\n "zero_optimization": {\n "stage": 1,\n "allgather_partitions": True,\n "allgather_bucket_size": 500000000,\n "overlap_comm": True,\n "reduce_scatter": True,\n "reduce_bucket_size": 500000000,\n "contiguous_gradients": True,\n },\n "min_lr": 0.000012,\n\n # batch / data settings\n "train_micro_batch_size_per_gpu": 1,\n "data_impl": "mmap",\n\n # activation checkpointing\n "checkpoint_activations": true,\n "checkpoint_num_layers": 1,\n "partition_activations": true,\n "synchronize_each_layer": true,\n\n # regularization\n "gradient_clipping": 1.0,\n "weight_decay": 0.1,\n "hidden_dropout": 0,\n "attention_dropout": 0,\n # Flash Attention\n "attention_config": [[["flash"], 32]],\n # precision settings\n "fp16": {\n "fp16": true,\n "enabled": true,\n "loss_scale": 0,\n "loss_scale_window": 1000,\n "hysteresis": 2,\n "min_loss_scale": 1\n },\n\n # misc. training settings\n "train_iters": 320000,\n "lr_decay_iters": 320000,\n "distributed_backend": "nccl",\n "lr_decay_style": "cosine",\n "warmup": 0.01,\n "checkpoint_factor": 10000,\n "eval_interval": 1000,\n "eval_iters": 10,\n\n # logging\n "log_interval": 100,\n "steps_per_print": 10,\n "keep_last_n_checkpoints": 4,\n "wall_clock_breakdown": true,\n\n "memory_profiling": true,\n "memory_profiling_path": "/NS/llm-pretraining/work/afkhan/USC_Colab/gpt-neox/Artifacts/Profiles",\n "profile_step_start": 0,\n "profile_step_stop": 100\n}\n', 'local_setup_wandb_modified_with_slurm.yml': '# Suggested data paths when using GPT-NeoX locally\n{\n "data_path": "data/enwik8/enwik8_text_document",\n\n # or for weighted datasets:\n # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],\n # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],\n # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],\n # "train-data-weights": [1., 2.],\n # "test-data-weights": [2., 1.],\n # "valid-data-weights": [0.5, 0.4],\n\n # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group.\n # WARNING: setting this to True will override any user provided weights\n # "weight_by_num_documents": false,\n # "weighted_sampler_alpha": 0.3,\n\n "vocab_file": "data/gpt2-vocab.json",\n "merge_file": "data/gpt2-merges.txt",\n\n "save": "checkpoints",\n "load": "checkpoints",\n "checkpoint_validation_with_forward_pass": False,\n\n "tensorboard_dir": "tensorboard",\n "log_dir": "logs",\n "use_wandb": True,\n "wandb_host": "https://api.wandb.ai",\n "wandb_project": "neox",\n "wandb_run_name": "6.7B-FA-BS-1-2x8xA100",\n\n "peak_theoretical_tflops": 312,\n\n "launcher": "slurm",\n "deepspeed_slurm": true,\n "no_ssh_check": true,\n}\n'}updated | |
| data_impl ....................... mmap........................updated | |
| data_path ....................... data/enwik8/enwik8_text_documentupdated | |
| deepspeed_slurm ................. True........................updated | |
| dynamic_loss_scale .............. True........................updated | |
| eval_iters ...................... 10..........................updated | |
| fp16 ............................ {'fp16': True, 'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'hysteresis': 2, 'min_loss_scale': 1}updated | |
| global_num_gpus ................. 16..........................updated | |
| hidden_size ..................... 4096........................updated | |
| init_method ..................... small_init..................updated | |
| is_pipe_parallel ................ True........................updated | |
| keep_last_n_checkpoints ......... 4...........................updated | |
| launcher ........................ slurm.......................updated | |
| load ............................ checkpoints.................updated | |
| log_dir ......................... logs........................updated | |
| lr .............................. 0.00012.....................updated | |
| lr_decay_iters .................. 320000......................updated | |
| lr_decay_style .................. cosine......................updated | |
| max_position_embeddings ......... 2048........................updated | |
| memory_profiling ................ True........................updated | |
| memory_profiling_path ........... /NS/llm-pretraining/work/afkhan/USC_Colab/gpt-neox/Artifacts/Profilesupdated | |
| merge_file ...................... data/gpt2-merges.txt........updated | |
| min_lr .......................... 1.2e-05.....................updated | |
| model_parallel_size ............. 2...........................updated | |
| no_ssh_check .................... True........................updated | |
| no_weight_tying ................. True........................updated | |
| num_attention_heads ............. 32..........................updated | |
| num_layers ...................... 32..........................updated | |
| optimizer ....................... {'type': 'Adam', 'params': {'lr': 0.00012, 'betas': [0.9, 0.95], 'eps': 1e-08}}updated | |
| optimizer_type .................. Adam........................updated | |
| output_layer_init_method ........ wang_init...................updated | |
| partition_activations ........... True........................updated | |
| peak_theoretical_tflops ......... 312.........................updated | |
| pipe_parallel_size .............. 1...........................updated | |
| pos_emb ......................... rotary......................updated | |
| precision ....................... fp16........................updated | |
| profile_step_start .............. 0...........................updated | |
| profile_step_stop ............... 100.........................updated | |
| save ............................ checkpoints.................updated | |
| seq_length ...................... 2048........................updated | |
| sparsity_config ................. {}..........................updated | |
| synchronize_each_layer .......... True........................updated | |
| tensorboard_dir ................. tensorboard.................updated | |
| text_gen_type ................... unconditional...............updated | |
| train_batch_size ................ 8...........................updated | |
| train_iters ..................... 320000......................updated | |
| train_micro_batch_size_per_gpu .. 1...........................updated | |
| use_wandb ....................... True........................updated | |
| user_script ..................... train.py....................updated | |
| vocab_file ...................... data/gpt2-vocab.json........updated | |
| wall_clock_breakdown ............ True........................updated | |
| wandb_group ..................... 5i4cy3vh_n5bc7y7e...........updated | |
| wandb_run_name .................. 6.7B-FA-BS-1-2x8xA100.......updated | |
| zero_allgather_bucket_size ...... 500000000...................updated | |
| zero_contiguous_gradients ....... True........................updated | |
| zero_optimization ............... {'stage': 1, 'allgather_partitions': True, 'allgather_bucket_size': 500000000, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 500000000, 'contiguous_gradients': True}updated | |
| zero_reduce_bucket_size ......... 500000000...................updated | |
| zero_reduce_scatter ............. True........................updated | |
| zero_stage ...................... 1...........................updated | |
| account ......................... None........................default | |
| activation ...................... gelu........................default | |
| activation_checkpointing ........ None........................default | |
| adlr_autoresume ................. False.......................default | |
| adlr_autoresume_interval ........ 1000........................default | |
| allow_chopped ................... True........................default | |
| amp ............................. None........................default | |
| apply_query_key_layer_scaling ... False.......................default | |
| attention_dropout ............... 0...........................default | |
| attention_softmax_in_fp32 ....... False.......................default | |
| autotuning ...................... None........................default | |
| autotuning_run .................. None........................default | |
| base_shapes_file ................ None........................default | |
| bf16 ............................ None........................default | |
| bias_dropout_fusion ............. False.......................default | |
| bias_gelu_fusion ................ False.......................default | |
| char_level_ppl .................. False.......................default | |
| checkpoint ...................... None........................default | |
| checkpoint_in_cpu ............... False.......................default | |
| checkpoint_num_layers ........... 1...........................default | |
| checkpoint_scale ................ linear......................default | |
| checkpoint_validation_with_forward_pass False................default | |
| clip_grad ....................... 1.0.........................default | |
| comet_experiment ................ None........................default | |
| comet_experiment_name ........... None........................default | |
| comet_others .................... None........................default | |
| comet_project ................... None........................default | |
| comet_tags ...................... None........................default | |
| comet_workspace ................. None........................default | |
| comment ......................... None........................default | |
| comms_logger .................... None........................default | |
| communication_data_type ......... None........................default | |
| compression_training ............ None........................default | |
| contiguous_checkpointing ........ False.......................default | |
| coord_check ..................... False.......................default | |
| create_moe_param_group .......... True........................default | |
| csv_monitor ..................... None........................default | |
| curriculum_learning ............. None........................default | |
| curriculum_seqlen ............... 0...........................default | |
| data_efficiency ................. None........................default | |
| data_types ...................... None........................default | |
| dataset_impl .................... gpt2........................default | |
| deepscale ....................... False.......................default | |
| deepscale_config ................ None........................default | |
| deepspeed ....................... True........................default | |
| deepspeed_activation_checkpointing True......................default | |
| deepspeed_extra_args ............ None........................default | |
| deepspeed_mpi ................... False.......................default | |
| detect_nvlink_pairs ............. False.......................default | |
| dim_att ......................... None........................default | |
| distributed_backend ............. nccl........................default | |
| do_test ......................... None........................default | |
| do_train ........................ None........................default | |
| do_valid ........................ None........................default | |
| dpo_beta ........................ 0.1.........................default | |
| dpo_fp32 ........................ True........................default | |
| dpo_reference_free .............. False.......................default | |
| dump_state ...................... False.......................default | |
| elasticity ...................... None........................default | |
| enable_expert_tensor_parallelism False.......................default | |
| eod_mask_loss ................... False.......................default | |
| eval_interval ................... 1000........................default | |
| eval_results_prefix ............. ............................default | |
| eval_tasks ...................... None........................default | |
| exclude ......................... None........................default | |
| exit_interval ................... None........................default | |
| expansion_factor ................ None........................default | |
| expert_interval ................. 2...........................default | |
| extra_save_iters ................ None........................default | |
| ffn_dim ......................... None........................default | |
| finetune ........................ False.......................default | |
| flops_profiler .................. None........................default | |
| force_multi ..................... False.......................default | |
| fp16_lm_cross_entropy ........... False.......................default | |
| fp32_allreduce .................. False.......................default | |
| fp32_reinforce .................. True........................default | |
| git_hash ........................ dc454ee.....................default | |
| gmlp_attn_dim ................... 64..........................default | |
| gpt_j_residual .................. False.......................default | |
| gpt_j_tied ...................... False.......................default | |
| gradient_accumulation_steps ..... 1...........................default | |
| gradient_clipping ............... 1.0.........................default | |
| gradient_noise_scale_cpu_offload False.......................default | |
| gradient_noise_scale_n_batches .. 5...........................default | |
| gradient_predivide_factor ....... 1.0.........................default | |
| head_size ....................... None........................default | |
| hidden_dropout .................. 0...........................default | |
| hostfile ........................ None........................default | |
| hysteresis ...................... 2...........................default | |
| include ......................... None........................default | |
| init_method_std ................. 0.02........................default | |
| intermediate_size ............... None........................default | |
| iteration ....................... None........................default | |
| kl_div_beta ..................... 0.1.........................default | |
| kl_impl ......................... mse.........................default | |
| kto_beta ........................ 0.1.........................default | |
| kto_desirable_weight ............ 1.0.........................default | |
| kto_fp32 ........................ True........................default | |
| kto_undesirable_weight .......... 1.0.........................default | |
| layernorm_epsilon ............... 1e-05.......................default | |
| layernorm_fusion ................ False.......................default | |
| lazy_mpu_init ................... False.......................default | |
| local_rank ...................... None........................default | |
| log_grad_norm ................... False.......................default | |
| log_grad_pct_zeros .............. False.......................default | |
| log_gradient_noise_scale ........ False.......................default | |
| log_interval .................... 100.........................default | |
| log_optimizer_states ............ False.......................default | |
| log_param_norm .................. False.......................default | |
| loss_scale ...................... None........................default | |
| loss_scale_window ............... 1000.0......................default | |
| lr_decay_fraction ............... None........................default | |
| make_vocab_size_divisible_by .... 128.........................default | |
| mamba_causal_conv_fusion ........ False.......................default | |
| mamba_inner_func_fusion ......... False.......................default | |
| mamba_selective_fp32_params ..... True........................default | |
| mamba_selective_scan_fusion ..... False.......................default | |
| mamba_use_bias_in_conv .......... True........................default | |
| mamba_use_bias_in_linears ....... False.......................default | |
| master_addr ..................... None........................default | |
| master_port ..................... 29500.......................default | |
| maximum_tokens .................. 64..........................default | |
| min_scale ....................... 1.0.........................default | |
| mlp_multiple_of ................. 1...........................default | |
| mmap_warmup ..................... False.......................default | |
| moe_eval_capacity_factor ........ 1.0.........................default | |
| moe_expert_parallel_size ........ 1...........................default | |
| moe_glu ......................... False.......................default | |
| moe_jitter_eps .................. None........................default | |
| moe_lbl_in_fp32 ................. False.......................default | |
| moe_loss_coeff .................. 0.1.........................default | |
| moe_min_capacity ................ 4...........................default | |
| moe_num_experts ................. 1...........................default | |
| moe_token_dropping .............. False.......................default | |
| moe_top_k ....................... 1...........................default | |
| moe_train_capacity_factor ....... 1.0.........................default | |
| moe_type ........................ megablocks..................default | |
| moe_use_residual ................ True........................default | |
| mup_attn_temp ................... 1.0.........................default | |
| mup_embedding_mult .............. 1.0.........................default | |
| mup_init_scale .................. 1.0.........................default | |
| mup_output_temp ................. 1.0.........................default | |
| mup_rp_embedding_mult ........... 1.0.........................default | |
| mup_width_scale ................. 2...........................default | |
| neg_test_data_paths ............. None........................default | |
| neg_test_label_data_paths ....... None........................default | |
| neg_train_data_paths ............ None........................default | |
| neg_train_label_data_paths ...... None........................default | |
| neg_valid_data_paths ............ None........................default | |
| neg_valid_label_data_paths ...... None........................default | |
| no_load_optim ................... False.......................default | |
| no_load_rng ..................... False.......................default | |
| no_save_optim ................... False.......................default | |
| no_save_rng ..................... False.......................default | |
| norm ............................ layernorm...................default | |
| num_gpus ........................ None........................default | |
| num_kv_heads .................... None........................default | |
| num_nodes ....................... -1..........................default | |
| num_samples ..................... 1...........................default | |
| num_unique_layers ............... None........................default | |
| num_workers ..................... 2...........................default | |
| online_dataserver_ips ........... localhost...................default | |
| online_dataserver_ports ......... 10000.......................default | |
| onnx_safe ....................... False.......................default | |
| opt_pos_emb_offset .............. 0...........................default | |
| output_layer_parallelism ........ column......................default | |
| override_lr_scheduler ........... False.......................default | |
| pack_impl ....................... packed......................default | |
| padded_vocab_size ............... None........................default | |
| param_sharing_style ............. grouped.....................default | |
| pipe_partition_method ........... type:transformer|mlp........default | |
| pos_test_data_paths ............. None........................default | |
| pos_test_label_data_paths ....... None........................default | |
| pos_train_data_paths ............ None........................default | |
| pos_train_label_data_paths ...... None........................default | |
| pos_valid_data_paths ............ None........................default | |
| pos_valid_label_data_paths ...... None........................default | |
| precompute_model_name ........... None........................default | |
| prescale_gradients .............. False.......................default | |
| profile ......................... False.......................default | |
| profile_backward ................ False.......................default | |
| prompt_end ...................... \n..........................default | |
| rank ............................ None........................default | |
| recompute ....................... False.......................default | |
| reinforce_leave_one_out ......... False.......................default | |
| return_logits ................... False.......................default | |
| rms_norm_epsilon ................ 1e-08.......................default | |
| rmsnorm_fusion .................. False.......................default | |
| rope_fusion ..................... False.......................default | |
| rotary_emb_base ................. 10000.......................default | |
| rotary_pct ...................... 1.0.........................default | |
| rotary_save_freqs_buffer ........ False.......................default | |
| rpe_max_distance ................ 128.........................default | |
| rpe_num_buckets ................. 32..........................default | |
| s3_chunk_size ................... 104857600...................default | |
| s3_path ......................... None........................default | |
| sample_input_file ............... None........................default | |
| sample_output_file .............. samples.txt.................default | |
| save_base_shapes ................ False.......................default | |
| scaled_masked_softmax_fusion .... False.......................default | |
| scaled_upper_triang_masked_softmax_fusion False..............default | |
| scalenorm_epsilon ............... 1e-08.......................default | |
| scheduler ....................... None........................default | |
| seed ............................ 1234........................default | |
| sequence_parallel ............... False.......................default | |
| serve_model_weights ............. False.......................default | |
| short_seq_prob .................. 0.1.........................default | |
| sliding_window_width ............ None........................default | |
| soft_prompt_tuning .............. None........................default | |
| sparse_attention ................ None........................default | |
| sparse_gradients ................ False.......................default | |
| split ........................... 969, 30, 1..................default | |
| steps_per_print ................. 10..........................default | |
| te_columnparallel ............... False.......................default | |
| te_fp8_amax_compute_algo ........ most_recent.................default | |
| te_fp8_amax_history_len ......... 1...........................default | |
| te_fp8_format ................... hybrid......................default | |
| te_fp8_margin ................... 0...........................default | |
| te_fp8_mha ...................... False.......................default | |
| te_fp8_wgrad .................... True........................default | |
| te_layernorm_mlp ................ False.......................default | |
| te_mha .......................... False.......................default | |
| te_rowparallel .................. False.......................default | |
| temperature ..................... 0.0.........................default | |
| tensorboard ..................... None........................default | |
| test_data_paths ................. None........................default | |
| test_data_weights ............... None........................default | |
| test_label_data_paths ........... None........................default | |
| test_reward_data_paths .......... None........................default | |
| tokenizer_type .................. GPT2BPETokenizer............default | |
| top_k ........................... 0...........................default | |
| top_p ........................... 0.0.........................default | |
| train_data_paths ................ None........................default | |
| train_data_weights .............. None........................default | |
| train_epochs .................... None........................default | |
| train_impl ...................... normal......................default | |
| train_label_data_paths .......... None........................default | |
| train_reward_data_paths ......... None........................default | |
| use_bias_in_attn_linear ......... True........................default | |
| use_bias_in_mlp ................. True........................default | |
| use_bias_in_norms ............... True........................default | |
| use_bnb_optimizer ............... False.......................default | |
| use_checkpoint_lr_scheduler ..... False.......................default | |
| use_comet ....................... None........................default | |
| use_cpu_initialization .......... False.......................default | |
| use_flashattn_swiglu ............ False.......................default | |
| use_mup ......................... False.......................default | |
| use_qk_layernorm ................ False.......................default | |
| use_shared_fs ................... True........................default | |
| use_tutel ....................... False.......................default | |
| valid_data_paths ................ None........................default | |
| valid_data_weights .............. None........................default | |
| valid_label_data_paths .......... None........................default | |
| valid_reward_data_paths ......... None........................default | |
| wandb ........................... None........................default | |
| wandb_host ...................... https://api.wandb.ai........default | |
| wandb_init_all_ranks ............ False.......................default | |
| wandb_project ................... neox........................default | |
| wandb_team ...................... None........................default | |
| warmup .......................... 0.01........................default | |
| weight_by_num_documents ......... False.......................default | |
| weight_decay .................... 0.1.........................default | |
| weight_server_port .............. 6000........................default | |
| weighted_sampler_alpha .......... 1.0.........................default | |
| world_size ...................... None........................default | |
| z_loss .......................... 0.0.........................default | |
| ---------------- end of arguments ---------------- | |
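The derived values in the dump above are easy to sanity-check: with 16 GPUs, model_parallel_size 2, and pipe_parallel_size 1, DeepSpeed's batch-size identity yields the train_batch_size of 8. A quick check, assuming the standard relation micro batch per GPU × gradient accumulation × data-parallel degree:

```python
# Sanity check of the derived batch size in the argument dump above.
global_num_gpus = 16
model_parallel_size = 2       # tensor parallel
pipe_parallel_size = 1
micro_batch_per_gpu = 1       # train_micro_batch_size_per_gpu
grad_accum_steps = 1          # gradient_accumulation_steps

data_parallel_size = global_num_gpus // (model_parallel_size * pipe_parallel_size)
train_batch_size = micro_batch_per_gpu * grad_accum_steps * data_parallel_size

assert data_parallel_size == 8
assert train_batch_size == 8  # matches "train_batch_size ... 8" above
```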
| NeoXArgs.configure_distributed_args() using world size: 16 and model-parallel size: 2 | |
| [2025-01-09 09:55:17,930] [INFO] [runner.py:586:main] cmd = srun -n 16 --export=ALL,WANDB_API_KEY=REMOVED,PYTHONPATH=/NS/llm-pretraining/work/afkhan/USC_Colab/gpt-neox /NS/venvs/work/afkhan/neoxolmo/bin/python -u train.py --deepspeed_config eyJ0cmFpbl9iYXRjaF9zaXplIjogOCwgInRyYWluX21pY3JvX2JhdGNoX3NpemVfcGVyX2dwdSI6IDEsICJvcHRpbWl6ZXIiOiB7InR5cGUiOiAiQWRhbSIsICJwYXJhbXMiOiB7ImxyIjogMC4wMDAxMiwgImJldGFzIjogWzAuOSwgMC45NV0sICJlcHMiOiAxZS0wOH19LCAiZnAxNiI6IHsiZnAxNiI6IHRydWUsICJlbmFibGVkIjogdHJ1ZSwgImxvc3Nfc2NhbGUiOiAwLCAibG9zc19zY2FsZV93aW5kb3ciOiAxMDAwLCAiaHlzdGVyZXNpcyI6IDIsICJtaW5fbG9zc19zY2FsZSI6IDF9LCAiemVyb19vcHRpbWl6YXRpb24iOiB7InN0YWdlIjogMSwgImFsbGdhdGhlcl9wYXJ0aXRpb25zIjogdHJ1ZSwgImFsbGdhdGhlcl9idWNrZXRfc2l6ZSI6IDUwMDAwMDAwMCwgIm92ZXJsYXBfY29tbSI6IHRydWUsICJyZWR1Y2Vfc2NhdHRlciI6IHRydWUsICJyZWR1Y2VfYnVja2V0X3NpemUiOiA1MDAwMDAwMDAsICJjb250aWd1b3VzX2dyYWRpZW50cyI6IHRydWV9LCAid2FsbF9jbG9ja19icmVha2Rvd24iOiB0cnVlfQ== --megatron_config {"launcher": "slurm", "no_ssh_check": true, "train_batch_size": 8, "train_micro_batch_size_per_gpu": 1, "optimizer": {"type": "Adam", "params": {"lr": 0.00012, "betas": [0.9, 0.95], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1}, "zero_optimization": {"stage": 1, "allgather_partitions": true, "allgather_bucket_size": 500000000, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 500000000, "contiguous_gradients": true}, "wall_clock_breakdown": true, "precision": "fp16", "num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32, "seq_length": 2048, "max_position_embeddings": 2048, "pos_emb": "rotary", "no_weight_tying": true, "attention_config": ["flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash"], "sparsity_config": {}, "init_method": "small_init", "output_layer_init_method": "wang_init", "lr_decay_style": "cosine", "lr_decay_iters": 320000, "min_lr": 1.2e-05, "optimizer_type": "Adam", "zero_stage": 1, "zero_reduce_scatter": true, "zero_contiguous_gradients": true, "zero_reduce_bucket_size": 500000000, "zero_allgather_bucket_size": 500000000, "lr": 0.00012, "data_path": "data/enwik8/enwik8_text_document", "data_impl": "mmap", "save": "checkpoints", "config_files": {"6-7B.yml": "# GPT-2 pretraining setup\n{\n   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages\n   # across the node boundaries )\n   \"pipe_parallel_size\": 1,\n   \"model_parallel_size\": 2,\n\n   # model settings\n   \"num_layers\": 32,\n   \"hidden_size\": 4096,\n   \"num_attention_heads\": 32,\n   \"seq_length\": 2048,\n   \"max_position_embeddings\": 2048,\n   \"norm\": \"layernorm\",\n   \"pos_emb\": \"rotary\",\n   \"no_weight_tying\": true,\n   \"gpt_j_residual\": false,\n   \"output_layer_parallelism\": \"column\",\n\n   # these should provide some speedup but takes a while to build, set to true if desired\n   \"scaled_upper_triang_masked_softmax_fusion\": false,\n   \"bias_gelu_fusion\": false,\n   \"rope_fusion\": false,\n   \"layernorm_fusion\": false,\n\n   # init methods\n   \"init_method\": \"small_init\",\n   \"output_layer_init_method\": \"wang_init\",\n\n   # optimizer settings\n   \"optimizer\": {\n     \"type\": \"Adam\",\n     \"params\": {\n       \"lr\": 0.00012,\n       \"betas\": [0.9, 0.95],\n       \"eps\": 1.0e-8,\n     }\n   },\n\n   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training\n   \"zero_optimization\": {\n    \"stage\": 1,\n    \"allgather_partitions\": True,\n    \"allgather_bucket_size\": 500000000,\n    \"overlap_comm\": True,\n    \"reduce_scatter\": True,\n    \"reduce_bucket_size\": 500000000,\n    \"contiguous_gradients\": True,\n  },\n   \"min_lr\": 0.000012,\n\n   # batch / data settings\n   \"train_micro_batch_size_per_gpu\": 1,\n   \"data_impl\": \"mmap\",\n\n   # activation checkpointing\n   \"checkpoint_activations\": true,\n   \"checkpoint_num_layers\": 1,\n   \"partition_activations\": true,\n   \"synchronize_each_layer\": true,\n\n   # regularization\n   \"gradient_clipping\": 1.0,\n   \"weight_decay\": 0.1,\n   \"hidden_dropout\": 0,\n   \"attention_dropout\": 0,\n   # Flash Attention\n   \"attention_config\": [[[\"flash\"], 32]],\n   # precision settings\n   \"fp16\": {\n     \"fp16\": true,\n     \"enabled\": true,\n     \"loss_scale\": 0,\n     \"loss_scale_window\": 1000,\n     \"hysteresis\": 2,\n     \"min_loss_scale\": 1\n   },\n\n   # misc. training settings\n   \"train_iters\": 320000,\n   \"lr_decay_iters\": 320000,\n   \"distributed_backend\": \"nccl\",\n   \"lr_decay_style\": \"cosine\",\n   \"warmup\": 0.01,\n   \"checkpoint_factor\": 10000,\n   \"eval_interval\": 1000,\n   \"eval_iters\": 10,\n\n   # logging\n   \"log_interval\": 100,\n   \"steps_per_print\": 10,\n   \"keep_last_n_checkpoints\": 4,\n   \"wall_clock_breakdown\": true,\n\n   \"memory_profiling\": true,\n   \"memory_profiling_path\": \"/NS/llm-pretraining/work/afkhan/USC_Colab/gpt-neox/Artifacts/Profiles\",\n   \"profile_step_start\": 0,\n   \"profile_step_stop\": 100\n}\n", "local_setup_wandb_modified_with_slurm.yml": "# Suggested data paths when using GPT-NeoX locally\n{\n  \"data_path\": \"data/enwik8/enwik8_text_document\",\n\n  # or for weighted datasets:\n  # \"train-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n  # \"test-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n  # \"valid-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n  # \"train-data-weights\": [1., 2.],\n  # \"test-data-weights\": [2., 1.],\n  # \"valid-data-weights\": [0.5, 0.4],\n\n  # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group.\n  # WARNING: setting this to True will override any user provided weights\n  # \"weight_by_num_documents\": false,\n  # \"weighted_sampler_alpha\": 0.3,\n\n  \"vocab_file\": \"data/gpt2-vocab.json\",\n  \"merge_file\": \"data/gpt2-merges.txt\",\n\n  \"save\": \"checkpoints\",\n  \"load\": \"checkpoints\",\n  \"checkpoint_validation_with_forward_pass\": False,\n\n  \"tensorboard_dir\": \"tensorboard\",\n  \"log_dir\": \"logs\",\n  \"use_wandb\": True,\n  \"wandb_host\": \"https://api.wandb.ai\",\n  \"wandb_project\": \"neox\",\n  \"wandb_run_name\": \"6.7B-FA-BS-1-2x8xA100\",\n\n  \"peak_theoretical_tflops\": 312,\n\n  \"launcher\": \"slurm\",\n  \"deepspeed_slurm\": true,\n  \"no_ssh_check\": true,\n}\n"}, "load": "checkpoints", "checkpoint_factor": 10000, "batch_size": 1, "train_iters": 320000, "eval_iters": 10, "keep_last_n_checkpoints": 4, "vocab_file": "data/gpt2-vocab.json", "merge_file": "data/gpt2-merges.txt", "checkpoint_activations": true, "synchronize_each_layer": true, "partition_activations": true, "dynamic_loss_scale": true, "pipe_parallel_size": 1, "model_parallel_size": 2, "world_size": 16, "is_pipe_parallel": true, "use_wandb": true, "wandb_group": "5i4cy3vh_n5bc7y7e", "wandb_run_name": "6.7B-FA-BS-1-2x8xA100", "log_dir": "logs", "tensorboard_dir": "tensorboard", "peak_theoretical_tflops": 312, "memory_profiling": true, "memory_profiling_path": "/NS/llm-pretraining/work/afkhan/USC_Colab/gpt-neox/Artifacts/Profiles", "profile_step_start": 0, "profile_step_stop": 100, "text_gen_type": "unconditional", "local_rank": 0, "rank": 0, "deepspeed_slurm": true, "user_script": "train.py", "global_num_gpus": 16} | |
| [2025-01-09 09:55:25,210] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
| [2025-01-09 09:55:25,210] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
| [2025-01-09 09:55:25,216] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
| [2025-01-09 09:55:25,220] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
| [2025-01-09 09:55:25,223] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
| [2025-01-09 09:55:25,223] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
| [2025-01-09 09:55:25,223] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
| [2025-01-09 09:55:27,574] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
| [2025-01-09 09:55:36,365] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
| [2025-01-09 09:55:36,368] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
| [2025-01-09 09:55:36,369] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
| [2025-01-09 09:55:36,369] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
| [2025-01-09 09:55:36,369] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
| [2025-01-09 09:55:36,369] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
| [2025-01-09 09:55:36,369] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
| [2025-01-09 09:55:36,369] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
| Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba | |
| Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba | |
| Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba | |
| Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba | |
| Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba | |
| Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba | |
| Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba | |
| Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba | |
| For s3 checkpointing, please install boto3 either using requirements/requirements-s3.txt or https://github.com/boto/boto3 | |
| For s3 checkpointing, please install boto3 either using requirements/requirements-s3.txt or https://github.com/boto/boto3 | |
| For s3 checkpointing, please install boto3 either using requirements/requirements-s3.txt or https://github.com/boto/boto3 | |
| For s3 checkpointing, please install boto3 either using requirements/requirements-s3.txt or https://github.com/boto/boto3 | |
| For s3 checkpointing, please install boto3 either using requirements/requirements-s3.txt or https://github.com/boto/boto3 | |
| For s3 checkpointing, please install boto3 either using requirements/requirements-s3.txt or https://github.com/boto/boto3 | |
| For s3 checkpointing, please install boto3 either using requirements/requirements-s3.txt or https://github.com/boto/boto3 | |
| For s3 checkpointing, please install hf_transfer either using requirements/requirements-s3.txt or https://github.com/huggingface/hf_transfer | |
| For s3 checkpointing, please install hf_transfer either using requirements/requirements-s3.txt or https://github.com/huggingface/hf_transfer | |
| For s3 checkpointing, please install hf_transfer either using requirements/requirements-s3.txt or https://github.com/huggingface/hf_transfer | |
| For s3 checkpointing, please install hf_transfer either using requirements/requirements-s3.txt or https://github.com/huggingface/hf_transfer | |
| For s3 checkpointing, please install hf_transfer either using requirements/requirements-s3.txt or https://github.com/huggingface/hf_transfer | |
| For s3 checkpointing, please install hf_transfer either using requirements/requirements-s3.txt or https://github.com/huggingface/hf_transfer | |
| For s3 checkpointing, please install hf_transfer either using requirements/requirements-s3.txt or https://github.com/huggingface/hf_transfer | |
| For s3 checkpointing, please install boto3 either using requirements/requirements-s3.txt or https://github.com/boto/boto3 | |
| For s3 checkpointing, please install hf_transfer either using requirements/requirements-s3.txt or https://github.com/huggingface/hf_transfer | |
| NeoXArgs.configure_distributed_args() using world size: 16 and model-parallel size: 2 | |
| > building GPT2BPETokenizer tokenizer ... | |
| > padded vocab (size: 50257) with 175 dummy tokens (new size: 50432) | |
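The 175 dummy tokens follow from padding the vocab to a multiple of make_vocab_size_divisible_by (128, the default in the dump above) times model_parallel_size (2), so the embedding splits evenly across tensor-parallel ranks. The arithmetic:

```python
import math

# Vocab padding behind "> padded vocab (size: 50257) ... (new size: 50432)".
vocab_size = 50257
make_vocab_size_divisible_by = 128   # default from the argument dump
model_parallel_size = 2

multiple = make_vocab_size_divisible_by * model_parallel_size  # 256
padded = math.ceil(vocab_size / multiple) * multiple

assert padded == 50432
assert padded - vocab_size == 175    # the dummy tokens
```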
| WARNING: TensorBoard writing requested but is not available (are you using PyTorch 1.1.0 or later and do you have tensorboard installed?), no TensorBoard logs will be written. | |
| [2025-01-09 09:55:39,739] [INFO] [comm.py:637:init_distributed] cdb=None | |
| [2025-01-09 09:55:39,749] [INFO] [comm.py:637:init_distributed] cdb=None | |
| [2025-01-09 09:55:39,751] [INFO] [comm.py:637:init_distributed] cdb=None | |
| [2025-01-09 09:55:39,753] [INFO] [comm.py:637:init_distributed] cdb=None | |
| [2025-01-09 09:55:39,756] [INFO] [comm.py:637:init_distributed] cdb=None | |
| [2025-01-09 09:55:39,763] [INFO] [comm.py:637:init_distributed] cdb=None | |
| [2025-01-09 09:55:39,764] [INFO] [comm.py:637:init_distributed] cdb=None | |
| > initializing torch distributed ... | |
| [2025-01-09 09:55:42,670] [INFO] [comm.py:637:init_distributed] cdb=None | |
| [2025-01-09 09:55:42,670] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl | |
| Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba | |
| Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba | |
| Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba | |
| Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba | |
| Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba | |
| Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba | |
| Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba | |
| Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba | |
| For s3 checkpointing, please install boto3 either using requirements/requirements-s3.txt or https://github.com/boto/boto3 | |
| For s3 checkpointing, please install boto3 either using requirements/requirements-s3.txt or https://github.com/boto/boto3 | |
| For s3 checkpointing, please install boto3 either using requirements/requirements-s3.txt or https://github.com/boto/boto3 | |
| For s3 checkpointing, please install boto3 either using requirements/requirements-s3.txt or https://github.com/boto/boto3 | |
| For s3 checkpointing, please install boto3 either using requirements/requirements-s3.txt or https://github.com/boto/boto3 | |
| For s3 checkpointing, please install boto3 either using requirements/requirements-s3.txt or https://github.com/boto/boto3 | |
| For s3 checkpointing, please install boto3 either using requirements/requirements-s3.txt or https://github.com/boto/boto3 | |
| For s3 checkpointing, please install hf_transfer either using requirements/requirements-s3.txt or https://github.com/huggingface/hf_transfer | |
| For s3 checkpointing, please install hf_transfer either using requirements/requirements-s3.txt or https://github.com/huggingface/hf_transfer | |
| For s3 checkpointing, please install hf_transfer either using requirements/requirements-s3.txt or https://github.com/huggingface/hf_transfer | |
| For s3 checkpointing, please install hf_transfer either using requirements/requirements-s3.txt or https://github.com/huggingface/hf_transfer | |
| For s3 checkpointing, please install hf_transfer either using requirements/requirements-s3.txt or https://github.com/huggingface/hf_transfer | |
| For s3 checkpointing, please install hf_transfer either using requirements/requirements-s3.txt or https://github.com/huggingface/hf_transfer | |
| For s3 checkpointing, please install hf_transfer either using requirements/requirements-s3.txt or https://github.com/huggingface/hf_transfer | |
| For s3 checkpointing, please install boto3 either using requirements/requirements-s3.txt or https://github.com/boto/boto3 | |
| For s3 checkpointing, please install hf_transfer either using requirements/requirements-s3.txt or https://github.com/huggingface/hf_transfer | |
| [2025-01-09 09:56:03,323] [INFO] [comm.py:637:init_distributed] cdb=None | |
| [2025-01-09 09:56:03,325] [INFO] [comm.py:637:init_distributed] cdb=None | |
| [2025-01-09 09:56:03,327] [INFO] [comm.py:637:init_distributed] cdb=None | |
| [2025-01-09 09:56:03,330] [INFO] [comm.py:637:init_distributed] cdb=None | |
| [2025-01-09 09:56:03,333] [INFO] [comm.py:637:init_distributed] cdb=None | |
| [2025-01-09 09:56:03,335] [INFO] [comm.py:637:init_distributed] cdb=None | |
| [2025-01-09 09:56:03,337] [INFO] [comm.py:637:init_distributed] cdb=None | |
| [2025-01-09 09:56:06,760] [INFO] [comm.py:637:init_distributed] cdb=None | |
| > initializing model parallel with size 2 | |
| MPU DP: [0, 2, 4, 6, 8, 10, 12, 14] | |
| MPU DP: [1, 3, 5, 7, 9, 11, 13, 15] | |
| MPU PP: [0] | |
| MPU PP: [1] | |
| MPU PP: [2] | |
| MPU PP: [3] | |
| MPU PP: [4] | |
| MPU PP: [5] | |
| MPU PP: [6] | |
| MPU PP: [7] | |
| MPU PP: [8] | |
| MPU PP: [9] | |
| MPU PP: [10] | |
| MPU PP: [11] | |
| MPU PP: [12] | |
| MPU PP: [13] | |
| MPU PP: [14] | |
| MPU PP: [15] | |
| MPU MP: [0, 1] | |
| MPU MP: [2, 3] | |
| MPU MP: [4, 5] | |
| MPU MP: [6, 7] | |
| MPU MP: [8, 9] | |
| MPU MP: [10, 11] | |
| MPU MP: [12, 13] | |
| MPU MP: [14, 15] | |
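The group listing above is fully determined by world_size=16, model_parallel_size=2, pipe_parallel_size=1. A simplified reconstruction (the authoritative grouping lives in the DeepSpeed/Megatron topology code; this sketch just enumerates the same ranks for this flat layout):

```python
# Sketch: reconstruct the MPU groups above from the parallelism degrees.
world_size, mp, pp = 16, 2, 1
dp = world_size // (mp * pp)  # 8 data-parallel replicas

mp_groups = [list(range(i, i + mp)) for i in range(0, world_size, mp)]
dp_groups = [list(range(r, world_size, mp)) for r in range(mp)]

print(mp_groups)  # [[0, 1], [2, 3], ..., [14, 15]]   (the "MPU MP" lines)
print(dp_groups)  # [[0, 2, ..., 14], [1, 3, ..., 15]] (the "MPU DP" lines)
# With pp=1, every rank is a singleton pipeline group, hence "MPU PP: [r]".
```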
| > setting random seeds to 1234 ... | |
| [2025-01-09 09:56:06,815] [INFO] [checkpointing.py:227:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 | |
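The two seeds printed here are consistent with Megatron-style seed offsetting: the model-parallel RNG gets a fixed offset plus the model-parallel rank, while the data-parallel RNG keeps the base seed. A sketch of that relation (the 2718 offset is inferred from 3952 - 1234; see megatron's checkpointing.py for the authoritative formula):

```python
# Inferred seed relation behind model_parallel_cuda_manual_seed's printout:
# model-parallel seed = base seed + fixed offset (+ model-parallel rank),
# data-parallel seed = base seed. The 2718 offset matches 3952 - 1234.
base_seed, mp_rank = 1234, 0

model_parallel_seed = base_seed + 2718 + mp_rank
data_parallel_seed = base_seed

assert model_parallel_seed == 3952  # as printed for global rank 0
assert data_parallel_seed == 1234
```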
| make: Entering directory '/NS/llm-pretraining/work/afkhan/USC_Colab/gpt-neox/megatron/data' | |
| make: Entering directory '/NS/llm-pretraining/work/afkhan/USC_Colab/gpt-neox/megatron/data' | |
| make: Nothing to be done for 'default'. | |
| make: Leaving directory '/NS/llm-pretraining/work/afkhan/USC_Colab/gpt-neox/megatron/data' | |
| make: Nothing to be done for 'default'. | |
| make: Leaving directory '/NS/llm-pretraining/work/afkhan/USC_Colab/gpt-neox/megatron/data' | |
| > building train, validation, and test datasets ... | |
| reading sizes... | |
| reading pointers... | |
| reading document index... | |
| creating numpy buffer of mmap... | |
| creating memory view of numpy buffer... | |
| > dataset split: | |
| train: | |
| document indices in [0, 1) total of 1 documents | |
| validation: | |
| document indices in [1, 1) total of 0 documents | |
| test: | |
| document indices in [1, 1) total of 0 documents | |
| > loading doc-idx mapping from data/enwik8/enwik8_text_document_train_indexmap_2560000ns_2048sl_1234s_packedpi_ac_doc_idx.npy | |
| > loading sample-idx mapping from data/enwik8/enwik8_text_document_train_indexmap_2560000ns_2048sl_1234s_packedpi_ac_sample_idx.npy | |
| > loading shuffle-idx mapping from data/enwik8/enwik8_text_document_train_indexmap_2560000ns_2048sl_1234s_packedpi_ac_shuffle_idx.npy | |
| loaded indexed file in 0.040 seconds | |
| total number of samples: 2571718 | |
| total number of epochs: 190 | |
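The 2,560,000-sample figure baked into the index-map filenames above is train_iters times train_batch_size, and 190 epochs is just enough of the packed enwik8 stream to cover it. A back-calculation from the numbers the log prints (the real index-map logic lives in megatron's dataset code):

```python
import math

train_iters = 320_000
train_batch_size = 8
requested = train_iters * train_batch_size
assert requested == 2_560_000            # the "2560000ns" in the .npy names above

total_samples, epochs = 2_571_718, 190   # as printed above
per_epoch = total_samples // epochs      # ~13,535 packed 2048-token samples
assert math.ceil(requested / per_epoch) == 190  # 190 epochs cover the request
```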
| building GPT2 model ... | |
| SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None | |
| Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=1, model=0): 2, ProcessCoord(pipe=0, data=1, model=1): 3, ProcessCoord(pipe=0, data=2, model=0): 4, ProcessCoord(pipe=0, data=2, model=1): 5, ProcessCoord(pipe=0, data=3, model=0): 6, ProcessCoord(pipe=0, data=3, model=1): 7, ProcessCoord(pipe=0, data=4, model=0): 8, ProcessCoord(pipe=0, data=4, model=1): 9, ProcessCoord(pipe=0, data=5, model=0): 10, ProcessCoord(pipe=0, data=5, model=1): 11, ProcessCoord(pipe=0, data=6, model=0): 12, ProcessCoord(pipe=0, data=6, model=1): 13, ProcessCoord(pipe=0, data=7, model=0): 14, ProcessCoord(pipe=0, data=7, model=1): 15} | |
| [2025-01-09 09:56:11,770] [INFO] [module.py:375:_partition_layers] Partitioning pipeline stages with method type:transformer|mlp | |
| stage=0 layers=37 | |
| 0: EmbeddingPipe | |
| 1: _pre_transformer_block | |
| 2: ParallelTransformerLayerPipe | |
| 3: ParallelTransformerLayerPipe | |
| 4: ParallelTransformerLayerPipe | |
| 5: ParallelTransformerLayerPipe | |
| 6: ParallelTransformerLayerPipe | |
| 7: ParallelTransformerLayerPipe | |
| 8: ParallelTransformerLayerPipe | |
| 9: ParallelTransformerLayerPipe | |
| 10: ParallelTransformerLayerPipe | |
| 11: ParallelTransformerLayerPipe | |
| 12: ParallelTransformerLayerPipe | |
| 13: ParallelTransformerLayerPipe | |
| 14: ParallelTransformerLayerPipe | |
| 15: ParallelTransformerLayerPipe | |
| 16: ParallelTransformerLayerPipe | |
| 17: ParallelTransformerLayerPipe | |
| 18: ParallelTransformerLayerPipe | |
| 19: ParallelTransformerLayerPipe | |
| 20: ParallelTransformerLayerPipe | |
| 21: ParallelTransformerLayerPipe | |
| 22: ParallelTransformerLayerPipe | |
| 23: ParallelTransformerLayerPipe | |
| 24: ParallelTransformerLayerPipe | |
| 25: ParallelTransformerLayerPipe | |
| 26: ParallelTransformerLayerPipe | |
| 27: ParallelTransformerLayerPipe | |
| 28: ParallelTransformerLayerPipe | |
| 29: ParallelTransformerLayerPipe | |
| 30: ParallelTransformerLayerPipe | |
| 31: ParallelTransformerLayerPipe | |
| 32: ParallelTransformerLayerPipe | |
| 33: ParallelTransformerLayerPipe | |
| 34: _post_transformer_block | |
| 35: NormPipe | |
| 36: ParallelLinearPipe | |
| loss: partial | |
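The stage=0 layer count of 37 is the 32 transformer blocks plus the five wrapper layers visible in the listing; with pipe_parallel_size=1 the whole model is a single pipeline stage. A trivial check:

```python
# Decomposition of "stage=0 layers=37" from the partition listing above.
num_transformer_layers = 32    # ParallelTransformerLayerPipe, indices 2..33
wrappers = [
    "EmbeddingPipe",           # index 0
    "_pre_transformer_block",  # index 1
    "_post_transformer_block", # index 34
    "NormPipe",                # index 35
    "ParallelLinearPipe",      # index 36 (untied output head)
]
assert num_transformer_layers + len(wrappers) == 37
```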
| WARNING: APEX not installed - defaulting to deepspeed's fused adam | |
| WARNING: APEX not installed - defaulting to deepspeed's fused adam | |
| WARNING: APEX not installed - defaulting to deepspeed's fused adam | |
| WARNING: APEX not installed - defaulting to deepspeed's fused adam | |
| WARNING: APEX not installed - defaulting to deepspeed's fused adam | |
| WARNING: APEX not installed - defaulting to deepspeed's fused adam | |
| WARNING: APEX not installed - defaulting to deepspeed's fused adam | |
| WARNING: APEX not installed - defaulting to deepspeed's fused adam | |
| WARNING: APEX not installed - defaulting to deepspeed's fused adam | |
| WARNING: APEX not installed - defaulting to deepspeed's fused adam | |
| WARNING: APEX not installed - defaulting to deepspeed's fused adam | |
| Configuring Optimizer type: Adam with params: {'lr': 0.00012, 'betas': [0.9, 0.95], 'eps': 1e-08} | |
| WARNING: APEX not installed - defaulting to deepspeed's fused adam | |
| WARNING: APEX not installed - defaulting to deepspeed's fused adam | |
| WARNING: APEX not installed - defaulting to deepspeed's fused adam | |
| WARNING: APEX not installed - defaulting to deepspeed's fused adam | |
| WARNING: APEX not installed - defaulting to deepspeed's fused adam | |
| ninja: no work to do. | |
| Time to load fused_adam op: 8.791501998901367 seconds | |
| Time to load fused_adam op: 8.851798295974731 seconds | |
| Time to load fused_adam op: 8.852033138275146 seconds | |
| Time to load fused_adam op: 8.852229118347168 seconds | |
| Time to load fused_adam op: 8.852245807647705 seconds | |
| Time to load fused_adam op: 8.854000091552734 seconds | |
| Time to load fused_adam op: 8.856988668441772 seconds | |
| > learning rate decay style: cosine | |
| DeepSpeed is enabled. | |
| [2025-01-09 09:56:43,823] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.12.4+02e2ebf, git-hash=02e2ebf, git-branch=HEAD | |
| Time to load fused_adam op: 8.856734991073608 seconds | |
| Time to load fused_adam op: 12.064608335494995 seconds | |
| Time to load fused_adam op: 12.064414978027344 seconds | |
| Time to load fused_adam op: 12.064436197280884 seconds | |
| Time to load fused_adam op: 12.064515352249146 seconds | |
| Time to load fused_adam op: 12.06449031829834 seconds | |
| Time to load fused_adam op: 12.064795732498169 seconds | |
| Time to load fused_adam op: 12.064433574676514 seconds | |
| Time to load fused_adam op: 12.06458568572998 seconds | |
| [2025-01-09 09:56:48,754] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False | |
| [2025-01-09 09:56:48,756] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer | |
| [2025-01-09 09:56:48,756] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer | |
| [2025-01-09 09:56:48,775] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam | |
| [2025-01-09 09:56:48,775] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=<class 'deepspeed.ops.adam.fused_adam.FusedAdam'> | |
| [2025-01-09 09:56:48,775] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.float16 ZeRO stage 1 optimizer | |
| [2025-01-09 09:56:48,775] [INFO] [stage_1_and_2.py:149:__init__] Reduce bucket size 500000000 | |
| [2025-01-09 09:56:48,775] [INFO] [stage_1_and_2.py:150:__init__] Allgather bucket size 500000000 | |
| [2025-01-09 09:56:48,775] [INFO] [stage_1_and_2.py:151:__init__] CPU Offload: False | |
| [2025-01-09 09:56:48,775] [INFO] [stage_1_and_2.py:152:__init__] Round robin gradient partitioning: False | |
| [2025-01-09 09:57:16,933] [INFO] [engine.py:139:__init__] is_pipe_partitioned= True is_grad_partitioned= True | |
| [2025-01-09 09:57:17,043] [INFO] [engine.py:139:__init__] is_pipe_partitioned= True is_grad_partitioned= True | |
| [2025-01-09 09:57:17,477] [INFO] [engine.py:139:__init__] is_pipe_partitioned= True is_grad_partitioned= True | |
| [2025-01-09 09:57:17,565] [INFO] [engine.py:139:__init__] is_pipe_partitioned= True is_grad_partitioned= True | |
| [2025-01-09 09:57:17,790] [INFO] [engine.py:139:__init__] is_pipe_partitioned= True is_grad_partitioned= True | |
| [2025-01-09 09:57:20,293] [INFO] [engine.py:139:__init__] is_pipe_partitioned= True is_grad_partitioned= True | |
| [2025-01-09 09:57:20,345] [INFO] [utils.py:802:see_memory_usage] Before initializing optimizer states | |
| [2025-01-09 09:57:20,346] [INFO] [utils.py:803:see_memory_usage] MA 7.98 GB Max_MA 7.98 GB CA 8.01 GB Max_CA 8 GB | |
| [2025-01-09 09:57:20,347] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 18.98 GB, percent = 0.9% | |
| [2025-01-09 09:57:20,414] [INFO] [engine.py:139:__init__] is_pipe_partitioned= True is_grad_partitioned= True | |
| [2025-01-09 09:57:20,460] [INFO] [engine.py:139:__init__] is_pipe_partitioned= True is_grad_partitioned= True | |
| [2025-01-09 09:57:20,487] [INFO] [utils.py:802:see_memory_usage] After initializing optimizer states | |
| [2025-01-09 09:57:20,488] [INFO] [utils.py:803:see_memory_usage] MA 11.18 GB Max_MA 12.77 GB CA 12.8 GB Max_CA 13 GB | |
| [2025-01-09 09:57:20,488] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 18.98 GB, percent = 0.9% | |
| [2025-01-09 09:57:20,488] [INFO] [stage_1_and_2.py:517:__init__] optimizer state initialized | |
| [2025-01-09 09:57:20,499] [INFO] [engine.py:139:__init__] is_pipe_partitioned= True is_grad_partitioned= True | |
| [2025-01-09 09:57:20,604] [INFO] [utils.py:802:see_memory_usage] After initializing ZeRO optimizer | |
| [2025-01-09 09:57:20,605] [INFO] [utils.py:803:see_memory_usage] MA 11.18 GB Max_MA 11.18 GB CA 12.8 GB Max_CA 13 GB | |
| [2025-01-09 09:57:20,605] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 18.98 GB, percent = 0.9% | |
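The ~3.2 GB jump in allocated memory (MA 7.98 GB before, 11.18 GB after) is consistent with ZeRO stage 1 materialising each rank's fp32 master-parameter partition. A back-of-envelope check, assuming a data-parallel degree of world_size / (model_parallel x pipe_parallel) = 8 / (2 x 1) = 4:

params_per_mp_rank = 3_429_048_320    # from the RANK=0 line further down
dp = 4                                # assumed: 8 GPUs / (mp=2 * pp=1)
master_partition_gib = params_per_mp_rank / dp * 4 / 2**30   # fp32 = 4 bytes
print(f"{master_partition_gib:.2f} GiB")   # ~3.19 GiB, matching the observed jump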
| [2025-01-09 09:57:20,609] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam | |
| [2025-01-09 09:57:20,609] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler | |
| [2025-01-09 09:57:20,609] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = <megatron.learning_rates.AnnealingLR object at 0x7fdef4f9a310> | |
| [2025-01-09 09:57:20,610] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[[0.9, 0.95], [0.9, 0.95]] | |
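lr=[0.0, 0.0] at step 0 (one value per parameter group) reflects the 1% linear warmup: with warmup 0.01 over 320,000 decay iters, the LR ramps to 1.2e-4 over the first 3,200 steps, then cosine-decays to the configured min_lr of 1.2e-5. A hypothetical re-implementation of that shape, not the actual megatron.learning_rates.AnnealingLR source:

import math

def lr_at(step, lr=1.2e-4, min_lr=1.2e-5, warmup=0.01, decay_iters=320_000):
    warmup_iters = int(warmup * decay_iters)          # 3,200 steps here
    if step < warmup_iters:                           # linear warmup from 0
        return lr * step / max(1, warmup_iters)       # step 0 -> 0.0, as logged
    frac = (step - warmup_iters) / max(1, decay_iters - warmup_iters)
    return min_lr + 0.5 * (lr - min_lr) * (1 + math.cos(math.pi * min(1.0, frac)))

print(lr_at(0), lr_at(3_200), lr_at(320_000))         # 0.0, 1.2e-04, 1.2e-05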
| [2025-01-09 09:57:20,610] [INFO] [config.py:979:print] DeepSpeedEngine configuration: | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] activation_checkpointing_config { | |
| "partition_activations": false, | |
| "contiguous_memory_optimization": false, | |
| "cpu_checkpointing": false, | |
| "number_checkpoints": null, | |
| "synchronize_checkpoint_boundary": false, | |
| "profile": false | |
| } | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] amp_enabled .................. False | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] amp_params ................... False | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] autotuning_config ............ { | |
| "enabled": false, | |
| "start_step": null, | |
| "end_step": null, | |
| "metric_path": null, | |
| "arg_mappings": null, | |
| "metric": "throughput", | |
| "model_info": null, | |
| "results_dir": "autotuning_results", | |
| "exps_dir": "autotuning_exps", | |
| "overwrite": true, | |
| "fast": true, | |
| "start_profile_step": 3, | |
| "end_profile_step": 5, | |
| "tuner_type": "gridsearch", | |
| "tuner_early_stopping": 5, | |
| "tuner_num_trials": 50, | |
| "model_info_path": null, | |
| "mp_size": 1, | |
| "max_train_batch_size": null, | |
| "min_train_batch_size": 1, | |
| "max_train_micro_batch_size_per_gpu": 1.024000e+03, | |
| "min_train_micro_batch_size_per_gpu": 1, | |
| "num_tuning_micro_batch_sizes": 3 | |
| } | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] bfloat16_enabled ............. False | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] checkpoint_parallel_write_pipeline False | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] checkpoint_tag_validation_enabled True | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] checkpoint_tag_validation_fail False | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7fde900e6a10> | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] communication_data_type ...... None | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] curriculum_enabled_legacy .... False | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] curriculum_params_legacy ..... False | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] data_efficiency_enabled ...... False | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] dataloader_drop_last ......... False | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] disable_allgather ............ False | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] dump_state ................... False | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 1000, 'delayed_shift': 2, 'consecutive_hysteresis': False, 'min_scale': 1} | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] eigenvalue_enabled ........... False | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] eigenvalue_gas_boundary_resolution 1 | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] eigenvalue_layer_name ........ bert.encoder.layer | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] eigenvalue_layer_num ......... 0 | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] eigenvalue_max_iter .......... 100 | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] eigenvalue_stability ......... 1e-06 | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] eigenvalue_tol ............... 0.01 | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] eigenvalue_verbose ........... False | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] elasticity_enabled ........... False | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] flops_profiler_config ........ { | |
| "enabled": false, | |
| "recompute_fwd_factor": 0.0, | |
| "profile_step": 1, | |
| "module_depth": -1, | |
| "top_modules": 1, | |
| "detailed": true, | |
| "output_file": null | |
| } | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] fp16_auto_cast ............... False | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] fp16_enabled ................. True | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] fp16_master_weights_and_gradients False | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] global_rank .................. 0 | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] grad_accum_dtype ............. None | |
| [2025-01-09 09:57:20,611] [INFO] [config.py:983:print] gradient_accumulation_steps .. 1 | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] gradient_clipping ............ 0.0 | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] gradient_predivide_factor .... 1.0 | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] initial_dynamic_scale ........ 65536 | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] load_universal_checkpoint .... False | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] loss_scale ................... 0 | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] memory_breakdown ............. False | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] mics_hierarchial_params_gather False | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] mics_shard_size .............. -1 | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] nebula_config ................ { | |
| "enabled": false, | |
| "persistent_storage_path": null, | |
| "persistent_time_interval": 100, | |
| "num_of_version_in_retention": 2, | |
| "enable_nebula_load": true, | |
| "load_path": null | |
| } | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] optimizer_legacy_fusion ...... False | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] optimizer_name ............... adam | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] optimizer_params ............. {'lr': 0.00012, 'betas': [0.9, 0.95], 'eps': 1e-08} | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] pld_enabled .................. False | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] pld_params ................... False | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] prescale_gradients ........... False | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] scheduler_name ............... None | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] scheduler_params ............. None | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] seq_parallel_communication_data_type torch.float32 | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] sparse_attention ............. None | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] sparse_gradients_enabled ..... False | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] steps_per_print .............. 10 | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] train_batch_size ............. 8 | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] train_micro_batch_size_per_gpu 1 | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] use_data_before_expert_parallel_ False | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] use_node_local_storage ....... False | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] wall_clock_breakdown ......... True | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] weight_quantization_config ... None | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] world_size ................... 8 | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] zero_allow_untested_optimizer False | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] zero_enabled ................. True | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] zero_force_ds_cpu_optimizer .. True | |
| [2025-01-09 09:57:20,612] [INFO] [config.py:983:print] zero_optimization_stage ...... 1 | |
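One invariant worth making explicit: DeepSpeed requires train_batch_size = micro_batch_per_gpu x gradient_accumulation_steps x number of data-parallel ranks. The values printed above satisfy it with all 8 ranks counted as data-parallel at validation time (the pipeline engine applies model parallelism separately):

# The invariant DeepSpeed checks at config time (values from the print above):
train_batch_size = 8
micro_per_gpu    = 1
grad_accum_steps = 1
dp_ranks         = 8   # world_size as used in this validation
assert train_batch_size == micro_per_gpu * grad_accum_steps * dp_ranks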
| [2025-01-09 09:57:20,612] [INFO] [config.py:969:print_user_config] json = { | |
| "train_batch_size": 8, | |
| "train_micro_batch_size_per_gpu": 1, | |
| "optimizer": { | |
| "type": "Adam", | |
| "params": { | |
| "lr": 0.00012, | |
| "betas": [0.9, 0.95], | |
| "eps": 1e-08 | |
| } | |
| }, | |
| "fp16": { | |
| "fp16": true, | |
| "enabled": true, | |
| "loss_scale": 0, | |
| "loss_scale_window": 1000, | |
| "hysteresis": 2, | |
| "min_loss_scale": 1 | |
| }, | |
| "zero_optimization": { | |
| "stage": 1, | |
| "allgather_partitions": true, | |
| "allgather_bucket_size": 5.000000e+08, | |
| "overlap_comm": true, | |
| "reduce_scatter": true, | |
| "reduce_bucket_size": 5.000000e+08, | |
| "contiguous_gradients": true | |
| }, | |
| "wall_clock_breakdown": true | |
| } | |
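The JSON above is the exact config the engine was constructed from. For reference, a minimal sketch of feeding such a config to DeepSpeed outside of NeoX (NeoX itself passes a client-built optimizer and LR scheduler, per the "Using client Optimizer" and "client LR scheduler" lines above; the module and config path below are placeholders, and running this standalone needs the usual torch.distributed environment variables):

import deepspeed
import torch.nn as nn

model = nn.Linear(8, 8)   # placeholder module; NeoX passes its PipelineModule

# ds_config.json is assumed to hold the JSON printed above.
model_engine, optimizer, _, lr_scheduler = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config="ds_config.json",
)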
| [2025-01-09 09:57:20,612] [INFO] [engine.py:99:__init__] CONFIG: micro_batches=1 micro_batch_size=1 | |
| [2025-01-09 09:57:20,613] [INFO] [engine.py:139:__init__] is_pipe_partitioned= True is_grad_partitioned= True | |
| [2025-01-09 09:57:20,610] [INFO] [engine.py:139:__init__] is_pipe_partitioned= True is_grad_partitioned= True | |
| [2025-01-09 09:57:20,617] [INFO] [engine.py:139:__init__] is_pipe_partitioned= True is_grad_partitioned= True | |
| [2025-01-09 09:57:20,781] [INFO] [engine.py:158:__init__] RANK=0 STAGE=0 LAYERS=37 [0, 37) STAGE_PARAMS=3429048320 (3429.048M) TOTAL_PARAMS=6858096640 (6858.097M) UNIQUE_PARAMS=6858096640 (6858.097M) | |
| [2025-01-09 09:57:20,782] [INFO] [engine.py:158:__init__] RANK=1 STAGE=0 LAYERS=37 [0, 37) STAGE_PARAMS=3429048320 (3429.048M) TOTAL_PARAMS=6858096640 (6858.097M) UNIQUE_PARAMS=6858096640 (6858.097M) | |
| > number of parameters on model parallel rank 0: 3429048320 | |
| > number of parameters on model parallel rank 1: 3429048320 |
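The final counts add up: TOTAL_PARAMS = 6,858,096,640 is split evenly across the two model-parallel ranks (3,429,048,320 each), and a standard transformer estimate from the config (32 layers, hidden size 4096) lands within ~0.1% of it. Back-of-envelope, assuming the GPT-2 vocab of 50,257 padded to 50,304 and untied input/output embeddings:

L, h, V = 32, 4096, 50_304       # layers, hidden size, padded vocab (assumed)
transformer = 12 * L * h * h     # attention (4h^2) + MLP (8h^2) per layer
embeddings  = 2 * V * h          # no_weight_tying -> separate in/out matrices
print(transformer + embeddings)  # 6,854,541,312 ~= TOTAL_PARAMS
                                 # (biases and layernorms make up the ~0.05% gap)
print(6_858_096_640 // 2)        # 3,429,048,320 per model-parallel rank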