-
-
Save sujithjoseph/c410514acfccc76974a8130a8afd2169 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than it appears below. To review it, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters.
```
[2023-02-12 23:29:27,805] [INFO] [utils.py:831:see_memory_usage] DeepSpeedZeRoOffload initialize [begin] | |
[2023-02-12 23:29:27,806] [INFO] [utils.py:836:see_memory_usage] MA 0.02 GB Max_MA 0.02 GB CA 0.75 GB Max_CA 1 GB | |
[2023-02-12 23:29:27,806] [INFO] [utils.py:841:see_memory_usage] CPU Virtual Memory: used = 49.65 GB, percent = 14.8% | |
Parameter Offload: Total persistent parameters: 9940992 in 412 params | |
[2023-02-12 23:29:28,163] [INFO] [utils.py:831:see_memory_usage] DeepSpeedZeRoOffload initialize [end] | |
[2023-02-12 23:29:28,164] [INFO] [utils.py:836:see_memory_usage] MA 0.0 GB Max_MA 0.02 GB CA 0.75 GB Max_CA 1 GB | |
[2023-02-12 23:29:28,164] [INFO] [utils.py:841:see_memory_usage] CPU Virtual Memory: used = 49.67 GB, percent = 14.8% | |
[2023-02-12 23:29:28,284] [INFO] [utils.py:831:see_memory_usage] Before creating fp16 partitions | |
[2023-02-12 23:29:28,285] [INFO] [utils.py:836:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.75 GB Max_CA 1 GB | |
[2023-02-12 23:29:28,285] [INFO] [utils.py:841:see_memory_usage] CPU Virtual Memory: used = 49.67 GB, percent = 14.8% | |
[2023-02-12 23:29:28,794] [INFO] [utils.py:831:see_memory_usage] After creating fp16 partitions: 2 | |
[2023-02-12 23:29:28,795] [INFO] [utils.py:836:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.75 GB Max_CA 1 GB | |
[2023-02-12 23:29:28,795] [INFO] [utils.py:841:see_memory_usage] CPU Virtual Memory: used = 49.71 GB, percent = 14.9% | |
╭───────────────────── Traceback (most recent call last) ──────────────────────╮ | |
│lft5_cdc.py:471 in <module> │ | |
│ │ | |
│ 468 │ | |
│ 469 │ | |
│ 470 if __name__ == "__main__": │ | |
│ ❱ 471 │ main() │ | |
│ 472 │ | |
│ │ | |
│/lft5_cdc.py:336 in main │ | |
│ │ | |
│ 333 │ │ | |
│ 334 │ │ | |
│ 335 │ model, train_dataloader, eval_dataloader, optimizer, lr_scheduler │ | |
│ ❱ 336 │ │ model, train_dataloader, eval_dataloader, optimizer, lr_schedu │ | |
│ 337 │ ) │ | |
│ 338 │ accelerator.print(model) │ | |
│ 339 │ # accelerator.state.deepspeed_plugin.zero_stage == 3 │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/accelerate/accelerator.py:943 in │ | |
│ prepare │ | |
│ │ | |
│ 940 │ │ │ old_named_params = self._get_named_parameters(*args) │ | |
│ 941 │ │ │ | |
│ 942 │ │ if self.distributed_type == DistributedType.DEEPSPEED: │ | |
│ ❱ 943 │ │ │ result = self._prepare_deepspeed(*args) │ | |
│ 944 │ │ elif self.distributed_type == DistributedType.MEGATRON_LM: │ | |
│ 945 │ │ │ result = self._prepare_megatron_lm(*args) │ | |
│ 946 │ │ else: │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/accelerate/accelerator.py:1173 in │ | |
│ _prepare_deepspeed │ | |
│ │ | |
│ 1170 │ │ │ │ │ │ if type(scheduler).__name__ in deepspeed.runt │ | |
│ 1171 │ │ │ │ │ │ │ kwargs["lr_scheduler"] = scheduler │ | |
│ 1172 │ │ │ │ | |
│ ❱ 1173 │ │ │ engine, optimizer, _, lr_scheduler = deepspeed.initialize │ | |
│ 1174 │ │ │ if optimizer is not None: │ | |
│ 1175 │ │ │ │ optimizer = DeepSpeedOptimizerWrapper(optimizer) │ | |
│ 1176 │ │ │ if scheduler is not None: │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/__init__.py:135 in │ | |
│ initialize │ | |
│ │ | |
│ 132 │ │ │ │ │ │ │ │ dist_init_required=dist_init_required │ | |
│ 133 │ │ │ │ │ │ │ │ collate_fn=collate_fn, │ | |
│ 134 │ │ │ │ │ │ │ │ config=config, │ | |
│ ❱ 135 │ │ │ │ │ │ │ │ config_params=config_params) │ | |
│ 136 │ else: │ | |
│ 137 │ │ assert mpu is None, "mpu must be None with pipeline parallelis │ | |
│ 138 │ │ engine = PipelineEngine(args=args, │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/engine.py:340 in │ | |
│ __init__ │ | |
│ │ | |
│ 337 │ │ │ model_parameters = self.module.parameters() │ | |
│ 338 │ │ │ | |
│ 339 │ │ if has_optimizer: │ | |
│ ❱ 340 │ │ │ self._configure_optimizer(optimizer, model_parameters) │ | |
│ 341 │ │ │ self._configure_lr_scheduler(lr_scheduler) │ | |
│ 342 │ │ │ self._report_progress(0) │ | |
│ 343 │ │ elif self.zero_optimization(): │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/engine.py:1291 in │ | |
│ _configure_optimizer │ | |
│ │ | |
│ 1288 │ │ optimizer_wrapper = self._do_optimizer_sanity_check(basic_opt │ | |
│ 1289 │ │ │ | |
│ 1290 │ │ if optimizer_wrapper == ZERO_OPTIMIZATION: │ | |
│ ❱ 1291 │ │ │ self.optimizer = self._configure_zero_optimizer(basic_opt │ | |
│ 1292 │ │ elif optimizer_wrapper == AMP: │ | |
│ 1293 │ │ │ amp_params = self.amp_params() │ | |
│ 1294 │ │ │ log_dist(f"Initializing AMP with these params: {amp_param │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/engine.py:1621 in │ | |
│ _configure_zero_optimizer │ | |
│ │ | |
│ 1618 │ │ │ │ │ gradient_predivide_factor=self.gradient_predivide │ | |
│ 1619 │ │ │ │ │ gradient_accumulation_steps=self.gradient_accumul │ | |
│ 1620 │ │ │ │ │ aio_config=self.aio_config(), │ | |
│ ❱ 1621 │ │ │ │ │ communication_data_type=self.communication_data_t │ | |
│ 1622 │ │ │ | |
│ 1623 │ │ else: │ | |
│ 1624 │ │ │ raise NotImplementedError("ZeRO stage {} not implemented" │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/zero/stage3.py:307 │ | |
│ in __init__ │ | |
│ │ | |
│ 304 │ │ │ max([ │ | |
│ 305 │ │ │ │ max(tensor.numel(), │ | |
│ 306 │ │ │ │ │ tensor.ds_numel) for tensor in fp16_partitioned_g │ | |
│ ❱ 307 │ │ │ ]) for fp16_partitioned_group in self.fp16_partitioned_gr │ | |
│ 308 │ │ ]) │ | |
│ 309 │ │ print_rank_0( │ | |
│ 310 │ │ │ f'Largest partitioned param numel = {largest_partitioned_ │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/zero/stage3.py:307 │ | |
│ in <listcomp> │ | |
│ │ | |
│ 304 │ │ │ max([ │ | |
│ 305 │ │ │ │ max(tensor.numel(), │ | |
│ 306 │ │ │ │ │ tensor.ds_numel) for tensor in fp16_partitioned_g │ | |
│ ❱ 307 │ │ │ ]) for fp16_partitioned_group in self.fp16_partitioned_gr │ | |
│ 308 │ │ ]) │ | |
│ 309 │ │ print_rank_0( │ | |
│ 310 │ │ │ f'Largest partitioned param numel = {largest_partitioned_ │ | |
╰──────────────────────────────────────────────────────────────────────────────╯ | |
ValueError: max() arg is an empty sequence | |
╭───────────────────── Traceback (most recent call last) ──────────────────────╮ | |
│ /home/jupyter/t5/flant5/lft5_cdc.py:471 in <module> │ | |
│ │ | |
│ 468 │ | |
│ 469 │ | |
│ 470 if __name__ == "__main__": │ | |
│ ❱ 471 │ main() │ | |
│ 472 │ | |
│ │ | |
│ /home/jupyter/t5/flant5/lft5_cdc.py:336 in main │ | |
│ │ | |
│ 333 │ │ | |
│ 334 │ │ | |
│ 335 │ model, train_dataloader, eval_dataloader, optimizer, lr_scheduler │ | |
│ ❱ 336 │ │ model, train_dataloader, eval_dataloader, optimizer, lr_schedu │ | |
│ 337 │ ) │ | |
│ 338 │ accelerator.print(model) │ | |
│ 339 │ # accelerator.state.deepspeed_plugin.zero_stage == 3 │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/accelerate/accelerator.py:943 in │ | |
│ prepare │ | |
│ │ | |
│ 940 │ │ │ old_named_params = self._get_named_parameters(*args) │ | |
│ 941 │ │ │ | |
│ 942 │ │ if self.distributed_type == DistributedType.DEEPSPEED: │ | |
│ ❱ 943 │ │ │ result = self._prepare_deepspeed(*args) │ | |
│ 944 │ │ elif self.distributed_type == DistributedType.MEGATRON_LM: │ | |
│ 945 │ │ │ result = self._prepare_megatron_lm(*args) │ | |
│ 946 │ │ else: │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/accelerate/accelerator.py:1173 in │ | |
│ _prepare_deepspeed │ | |
│ │ | |
│ 1170 │ │ │ │ │ │ if type(scheduler).__name__ in deepspeed.runt │ | |
│ 1171 │ │ │ │ │ │ │ kwargs["lr_scheduler"] = scheduler │ | |
│ 1172 │ │ │ │ | |
│ ❱ 1173 │ │ │ engine, optimizer, _, lr_scheduler = deepspeed.initialize │ | |
│ 1174 │ │ │ if optimizer is not None: │ | |
│ 1175 │ │ │ │ optimizer = DeepSpeedOptimizerWrapper(optimizer) │ | |
│ 1176 │ │ │ if scheduler is not None: │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/__init__.py:135 in │ | |
│ initialize │ | |
│ │ | |
│ 132 │ │ │ │ │ │ │ │ dist_init_required=dist_init_required │ | |
│ 133 │ │ │ │ │ │ │ │ collate_fn=collate_fn, │ | |
│ 134 │ │ │ │ │ │ │ │ config=config, │ | |
│ ❱ 135 │ │ │ │ │ │ │ │ config_params=config_params) │ | |
│ 136 │ else: │ | |
│ 137 │ │ assert mpu is None, "mpu must be None with pipeline parallelis │ | |
│ 138 │ │ engine = PipelineEngine(args=args, │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/engine.py:340 in │ | |
│ __init__ │ | |
│ │ | |
│ 337 │ │ │ model_parameters = self.module.parameters() │ | |
│ 338 │ │ │ | |
│ 339 │ │ if has_optimizer: │ | |
│ ❱ 340 │ │ │ self._configure_optimizer(optimizer, model_parameters) │ | |
│ 341 │ │ │ self._configure_lr_scheduler(lr_scheduler) │ | |
│ 342 │ │ │ self._report_progress(0) │ | |
│ 343 │ │ elif self.zero_optimization(): │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/engine.py:1291 in │ | |
│ _configure_optimizer │ | |
│ │ | |
│ 1288 │ │ optimizer_wrapper = self._do_optimizer_sanity_check(basic_opt │ | |
│ 1289 │ │ │ | |
│ 1290 │ │ if optimizer_wrapper == ZERO_OPTIMIZATION: │ | |
│ ❱ 1291 │ │ │ self.optimizer = self._configure_zero_optimizer(basic_opt │ | |
│ 1292 │ │ elif optimizer_wrapper == AMP: │ | |
│ 1293 │ │ │ amp_params = self.amp_params() │ | |
│ 1294 │ │ │ log_dist(f"Initializing AMP with these params: {amp_param │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/engine.py:1621 in │ | |
│ _configure_zero_optimizer │ | |
│ │ | |
│ 1618 │ │ │ │ │ gradient_predivide_factor=self.gradient_predivide │ | |
│ 1619 │ │ │ │ │ gradient_accumulation_steps=self.gradient_accumul │ | |
│ 1620 │ │ │ │ │ aio_config=self.aio_config(), │ | |
│ ❱ 1621 │ │ │ │ │ communication_data_type=self.communication_data_t │ | |
│ 1622 │ │ │ | |
│ 1623 │ │ else: │ | |
│ 1624 │ │ │ raise NotImplementedError("ZeRO stage {} not implemented" │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/zero/stage3.py:307 │ | |
│ in __init__ │ | |
│ │ | |
│ 304 │ │ │ max([ │ | |
│ 305 │ │ │ │ max(tensor.numel(), │ | |
│ 306 │ │ │ │ │ tensor.ds_numel) for tensor in fp16_partitioned_g │ | |
│ ❱ 307 │ │ │ ]) for fp16_partitioned_group in self.fp16_partitioned_gr │ | |
│ 308 │ │ ]) │ | |
│ 309 │ │ print_rank_0( │ | |
│ 310 │ │ │ f'Largest partitioned param numel = {largest_partitioned_ │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/zero/stage3.py:307 │ | |
│ in <listcomp> │ | |
│ │ | |
│ 304 │ │ │ max([ │ | |
│ 305 │ │ │ │ max(tensor.numel(), │ | |
│ 306 │ │ │ │ │ tensor.ds_numel) for tensor in fp16_partitioned_g │ | |
│ ❱ 307 │ │ │ ]) for fp16_partitioned_group in self.fp16_partitioned_gr │ | |
│ 308 │ │ ]) │ | |
│ 309 │ │ print_rank_0( │ | |
│ 310 │ │ │ f'Largest partitioned param numel = {largest_partitioned_ │ | |
╰──────────────────────────────────────────────────────────────────────────────╯ | |
ValueError: max() arg is an empty sequence | |
╭───────────────────── Traceback (most recent call last) ──────────────────────╮ | |
│ /home/jupyter/t5/flant5/lft5_cdc.py:471 in <module> │ | |
│ │ | |
│ 468 │ | |
│ 469 │ | |
│ 470 if __name__ == "__main__": │ | |
│ ❱ 471 │ main() │ | |
│ 472 │ | |
│ │ | |
│ /home/jupyter/t5/flant5/lft5_cdc.py:336 in main │ | |
│ │ | |
│ 333 │ │ | |
│ 334 │ │ | |
│ 335 │ model, train_dataloader, eval_dataloader, optimizer, lr_scheduler │ | |
│ ❱ 336 │ │ model, train_dataloader, eval_dataloader, optimizer, lr_schedu │ | |
│ 337 │ ) │ | |
│ 338 │ accelerator.print(model) │ | |
│ 339 │ # accelerator.state.deepspeed_plugin.zero_stage == 3 │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/accelerate/accelerator.py:943 in │ | |
│ prepare │ | |
│ │ | |
│ 940 │ │ │ old_named_params = self._get_named_parameters(*args) │ | |
│ 941 │ │ │ | |
│ 942 │ │ if self.distributed_type == DistributedType.DEEPSPEED: │ | |
│ ❱ 943 │ │ │ result = self._prepare_deepspeed(*args) │ | |
│ 944 │ │ elif self.distributed_type == DistributedType.MEGATRON_LM: │ | |
│ 945 │ │ │ result = self._prepare_megatron_lm(*args) │ | |
│ 946 │ │ else: │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/accelerate/accelerator.py:1173 in │ | |
│ _prepare_deepspeed │ | |
│ │ | |
│ 1170 │ │ │ │ │ │ if type(scheduler).__name__ in deepspeed.runt │ | |
│ 1171 │ │ │ │ │ │ │ kwargs["lr_scheduler"] = scheduler │ | |
│ 1172 │ │ │ │ | |
│ ❱ 1173 │ │ │ engine, optimizer, _, lr_scheduler = deepspeed.initialize │ | |
│ 1174 │ │ │ if optimizer is not None: │ | |
│ 1175 │ │ │ │ optimizer = DeepSpeedOptimizerWrapper(optimizer) │ | |
│ 1176 │ │ │ if scheduler is not None: │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/__init__.py:135 in │ | |
│ initialize │ | |
│ │ | |
│ 132 │ │ │ │ │ │ │ │ dist_init_required=dist_init_required │ | |
│ 133 │ │ │ │ │ │ │ │ collate_fn=collate_fn, │ | |
│ 134 │ │ │ │ │ │ │ │ config=config, │ | |
│ ❱ 135 │ │ │ │ │ │ │ │ config_params=config_params) │ | |
│ 136 │ else: │ | |
│ 137 │ │ assert mpu is None, "mpu must be None with pipeline parallelis │ | |
│ 138 │ │ engine = PipelineEngine(args=args, │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/engine.py:340 in │ | |
│ __init__ │ | |
│ │ | |
│ 337 │ │ │ model_parameters = self.module.parameters() │ | |
│ 338 │ │ │ | |
│ 339 │ │ if has_optimizer: │ | |
│ ❱ 340 │ │ │ self._configure_optimizer(optimizer, model_parameters) │ | |
│ 341 │ │ │ self._configure_lr_scheduler(lr_scheduler) │ | |
│ 342 │ │ │ self._report_progress(0) │ | |
│ 343 │ │ elif self.zero_optimization(): │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/engine.py:1291 in │ | |
│ _configure_optimizer │ | |
│ │ | |
│ 1288 │ │ optimizer_wrapper = self._do_optimizer_sanity_check(basic_opt │ | |
│ 1289 │ │ │ | |
│ 1290 │ │ if optimizer_wrapper == ZERO_OPTIMIZATION: │ | |
│ ❱ 1291 │ │ │ self.optimizer = self._configure_zero_optimizer(basic_opt │ | |
│ 1292 │ │ elif optimizer_wrapper == AMP: │ | |
│ 1293 │ │ │ amp_params = self.amp_params() │ | |
│ 1294 │ │ │ log_dist(f"Initializing AMP with these params: {amp_param │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/engine.py:1621 in │ | |
│ _configure_zero_optimizer │ | |
│ │ | |
│ 1618 │ │ │ │ │ gradient_predivide_factor=self.gradient_predivide │ | |
│ 1619 │ │ │ │ │ gradient_accumulation_steps=self.gradient_accumul │ | |
│ 1620 │ │ │ │ │ aio_config=self.aio_config(), │ | |
│ ❱ 1621 │ │ │ │ │ communication_data_type=self.communication_data_t │ | |
│ 1622 │ │ │ | |
│ 1623 │ │ else: │ | |
│ 1624 │ │ │ raise NotImplementedError("ZeRO stage {} not implemented" │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/zero/stage3.py:307 │ | |
│ in __init__ │ | |
│ │ | |
│ 304 │ │ │ max([ │ | |
│ 305 │ │ │ │ max(tensor.numel(), │ | |
│ 306 │ │ │ │ │ tensor.ds_numel) for tensor in fp16_partitioned_g │ | |
│ ❱ 307 │ │ │ ]) for fp16_partitioned_group in self.fp16_partitioned_gr │ | |
│ 308 │ │ ]) │ | |
│ 309 │ │ print_rank_0( │ | |
│ 310 │ │ │ f'Largest partitioned param numel = {largest_partitioned_ │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/zero/stage3.py:307 │ | |
│ in <listcomp> │ | |
│ │ | |
│ 304 │ │ │ max([ │ | |
│ 305 │ │ │ │ max(tensor.numel(), │ | |
│ 306 │ │ │ │ │ tensor.ds_numel) for tensor in fp16_partitioned_g │ | |
│ ❱ 307 │ │ │ ]) for fp16_partitioned_group in self.fp16_partitioned_gr │ | |
│ 308 │ │ ]) │ | |
│ 309 │ │ print_rank_0( │ | |
│ 310 │ │ │ f'Largest partitioned param numel = {largest_partitioned_ │ | |
╰──────────────────────────────────────────────────────────────────────────────╯ | |
ValueError: max() arg is an empty sequence | |
╭───────────────────── Traceback (most recent call last) ──────────────────────╮ | |
│ /home/jupyter/t5/flant5/lft5_cdc.py:471 in <module> │ | |
│ │ | |
│ 468 │ | |
│ 469 │ | |
│ 470 if __name__ == "__main__": │ | |
│ ❱ 471 │ main() │ | |
│ 472 │ | |
│ │ | |
│ /home/jupyter/t5/flant5/lft5_cdc.py:336 in main │ | |
│ │ | |
│ 333 │ │ | |
│ 334 │ │ | |
│ 335 │ model, train_dataloader, eval_dataloader, optimizer, lr_scheduler │ | |
│ ❱ 336 │ │ model, train_dataloader, eval_dataloader, optimizer, lr_schedu │ | |
│ 337 │ ) │ | |
│ 338 │ accelerator.print(model) │ | |
│ 339 │ # accelerator.state.deepspeed_plugin.zero_stage == 3 │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/accelerate/accelerator.py:943 in │ | |
│ prepare │ | |
│ │ | |
│ 940 │ │ │ old_named_params = self._get_named_parameters(*args) │ | |
│ 941 │ │ │ | |
│ 942 │ │ if self.distributed_type == DistributedType.DEEPSPEED: │ | |
│ ❱ 943 │ │ │ result = self._prepare_deepspeed(*args) │ | |
│ 944 │ │ elif self.distributed_type == DistributedType.MEGATRON_LM: │ | |
│ 945 │ │ │ result = self._prepare_megatron_lm(*args) │ | |
│ 946 │ │ else: │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/accelerate/accelerator.py:1173 in │ | |
│ _prepare_deepspeed │ | |
│ │ | |
│ 1170 │ │ │ │ │ │ if type(scheduler).__name__ in deepspeed.runt │ | |
│ 1171 │ │ │ │ │ │ │ kwargs["lr_scheduler"] = scheduler │ | |
│ 1172 │ │ │ │ | |
│ ❱ 1173 │ │ │ engine, optimizer, _, lr_scheduler = deepspeed.initialize │ | |
│ 1174 │ │ │ if optimizer is not None: │ | |
│ 1175 │ │ │ │ optimizer = DeepSpeedOptimizerWrapper(optimizer) │ | |
│ 1176 │ │ │ if scheduler is not None: │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/__init__.py:135 in │ | |
│ initialize │ | |
│ │ | |
│ 132 │ │ │ │ │ │ │ │ dist_init_required=dist_init_required │ | |
│ 133 │ │ │ │ │ │ │ │ collate_fn=collate_fn, │ | |
│ 134 │ │ │ │ │ │ │ │ config=config, │ | |
│ ❱ 135 │ │ │ │ │ │ │ │ config_params=config_params) │ | |
│ 136 │ else: │ | |
│ 137 │ │ assert mpu is None, "mpu must be None with pipeline parallelis │ | |
│ 138 │ │ engine = PipelineEngine(args=args, │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/engine.py:340 in │ | |
│ __init__ │ | |
│ │ | |
│ 337 │ │ │ model_parameters = self.module.parameters() │ | |
│ 338 │ │ │ | |
│ 339 │ │ if has_optimizer: │ | |
│ ❱ 340 │ │ │ self._configure_optimizer(optimizer, model_parameters) │ | |
│ 341 │ │ │ self._configure_lr_scheduler(lr_scheduler) │ | |
│ 342 │ │ │ self._report_progress(0) │ | |
│ 343 │ │ elif self.zero_optimization(): │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/engine.py:1291 in │ | |
│ _configure_optimizer │ | |
│ │ | |
│ 1288 │ │ optimizer_wrapper = self._do_optimizer_sanity_check(basic_opt │ | |
│ 1289 │ │ │ | |
│ 1290 │ │ if optimizer_wrapper == ZERO_OPTIMIZATION: │ | |
│ ❱ 1291 │ │ │ self.optimizer = self._configure_zero_optimizer(basic_opt │ | |
│ 1292 │ │ elif optimizer_wrapper == AMP: │ | |
│ 1293 │ │ │ amp_params = self.amp_params() │ | |
│ 1294 │ │ │ log_dist(f"Initializing AMP with these params: {amp_param │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/engine.py:1621 in │ | |
│ _configure_zero_optimizer │ | |
│ │ | |
│ 1618 │ │ │ │ │ gradient_predivide_factor=self.gradient_predivide │ | |
│ 1619 │ │ │ │ │ gradient_accumulation_steps=self.gradient_accumul │ | |
│ 1620 │ │ │ │ │ aio_config=self.aio_config(), │ | |
│ ❱ 1621 │ │ │ │ │ communication_data_type=self.communication_data_t │ | |
│ 1622 │ │ │ | |
│ 1623 │ │ else: │ | |
│ 1624 │ │ │ raise NotImplementedError("ZeRO stage {} not implemented" │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/zero/stage3.py:307 │ | |
│ in __init__ │ | |
│ │ | |
│ 304 │ │ │ max([ │ | |
│ 305 │ │ │ │ max(tensor.numel(), │ | |
│ 306 │ │ │ │ │ tensor.ds_numel) for tensor in fp16_partitioned_g │ | |
│ ❱ 307 │ │ │ ]) for fp16_partitioned_group in self.fp16_partitioned_gr │ | |
│ 308 │ │ ]) │ | |
│ 309 │ │ print_rank_0( │ | |
│ 310 │ │ │ f'Largest partitioned param numel = {largest_partitioned_ │ | |
│ │ | |
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/zero/stage3.py:307 │ | |
│ in <listcomp> │ | |
│ │ | |
│ 304 │ │ │ max([ │ | |
│ 305 │ │ │ │ max(tensor.numel(), │ | |
│ 306 │ │ │ │ │ tensor.ds_numel) for tensor in fp16_partitioned_g │ | |
│ ❱ 307 │ │ │ ]) for fp16_partitioned_group in self.fp16_partitioned_gr │ | |
│ 308 │ │ ]) │ | |
│ 309 │ │ print_rank_0( │ | |
│ 310 │ │ │ f'Largest partitioned param numel = {largest_partitioned_ │ | |
╰──────────────────────────────────────────────────────────────────────────────╯ | |
ValueError: max() arg is an empty sequence | |
[23:29:34] WARNING Sending process 44929 closing signal SIGTERM ]8;id=166210;file:///opt/conda/lib/python3.7/site-packages/torch/distributed/elastic/multiprocessing/api.py\api.py]8;;\:]8;id=536920;file:///opt/conda/lib/python3.7/site-packages/torch/distributed/elastic/multiprocessing/api.py#700\700]8;;\ | |
```
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment