@sujithjoseph
Created February 12, 2023 23:45
```
[2023-02-12 23:29:27,805] [INFO] [utils.py:831:see_memory_usage] DeepSpeedZeRoOffload initialize [begin]
[2023-02-12 23:29:27,806] [INFO] [utils.py:836:see_memory_usage] MA 0.02 GB Max_MA 0.02 GB CA 0.75 GB Max_CA 1 GB
[2023-02-12 23:29:27,806] [INFO] [utils.py:841:see_memory_usage] CPU Virtual Memory: used = 49.65 GB, percent = 14.8%
Parameter Offload: Total persistent parameters: 9940992 in 412 params
[2023-02-12 23:29:28,163] [INFO] [utils.py:831:see_memory_usage] DeepSpeedZeRoOffload initialize [end]
[2023-02-12 23:29:28,164] [INFO] [utils.py:836:see_memory_usage] MA 0.0 GB Max_MA 0.02 GB CA 0.75 GB Max_CA 1 GB
[2023-02-12 23:29:28,164] [INFO] [utils.py:841:see_memory_usage] CPU Virtual Memory: used = 49.67 GB, percent = 14.8%
[2023-02-12 23:29:28,284] [INFO] [utils.py:831:see_memory_usage] Before creating fp16 partitions
[2023-02-12 23:29:28,285] [INFO] [utils.py:836:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.75 GB Max_CA 1 GB
[2023-02-12 23:29:28,285] [INFO] [utils.py:841:see_memory_usage] CPU Virtual Memory: used = 49.67 GB, percent = 14.8%
[2023-02-12 23:29:28,794] [INFO] [utils.py:831:see_memory_usage] After creating fp16 partitions: 2
[2023-02-12 23:29:28,795] [INFO] [utils.py:836:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.75 GB Max_CA 1 GB
[2023-02-12 23:29:28,795] [INFO] [utils.py:841:see_memory_usage] CPU Virtual Memory: used = 49.71 GB, percent = 14.9%
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /home/jupyter/t5/flant5/lft5_cdc.py:471 in <module> │
│ │
│ 468 │
│ 469 │
│ 470 if __name__ == "__main__": │
│ ❱ 471 │ main() │
│ 472 │
│ │
│ /home/jupyter/t5/flant5/lft5_cdc.py:336 in main │
│ │
│ 333 │ │
│ 334 │ │
│ 335 │ model, train_dataloader, eval_dataloader, optimizer, lr_scheduler │
│ ❱ 336 │ │ model, train_dataloader, eval_dataloader, optimizer, lr_schedu │
│ 337 │ ) │
│ 338 │ accelerator.print(model) │
│ 339 │ # accelerator.state.deepspeed_plugin.zero_stage == 3 │
│ │
│ /opt/conda/lib/python3.7/site-packages/accelerate/accelerator.py:943 in │
│ prepare │
│ │
│ 940 │ │ │ old_named_params = self._get_named_parameters(*args) │
│ 941 │ │ │
│ 942 │ │ if self.distributed_type == DistributedType.DEEPSPEED: │
│ ❱ 943 │ │ │ result = self._prepare_deepspeed(*args) │
│ 944 │ │ elif self.distributed_type == DistributedType.MEGATRON_LM: │
│ 945 │ │ │ result = self._prepare_megatron_lm(*args) │
│ 946 │ │ else: │
│ │
│ /opt/conda/lib/python3.7/site-packages/accelerate/accelerator.py:1173 in │
│ _prepare_deepspeed │
│ │
│ 1170 │ │ │ │ │ │ if type(scheduler).__name__ in deepspeed.runt │
│ 1171 │ │ │ │ │ │ │ kwargs["lr_scheduler"] = scheduler │
│ 1172 │ │ │ │
│ ❱ 1173 │ │ │ engine, optimizer, _, lr_scheduler = deepspeed.initialize │
│ 1174 │ │ │ if optimizer is not None: │
│ 1175 │ │ │ │ optimizer = DeepSpeedOptimizerWrapper(optimizer) │
│ 1176 │ │ │ if scheduler is not None: │
│ │
│ /opt/conda/lib/python3.7/site-packages/deepspeed/__init__.py:135 in │
│ initialize │
│ │
│ 132 │ │ │ │ │ │ │ │ dist_init_required=dist_init_required │
│ 133 │ │ │ │ │ │ │ │ collate_fn=collate_fn, │
│ 134 │ │ │ │ │ │ │ │ config=config, │
│ ❱ 135 │ │ │ │ │ │ │ │ config_params=config_params) │
│ 136 │ else: │
│ 137 │ │ assert mpu is None, "mpu must be None with pipeline parallelis │
│ 138 │ │ engine = PipelineEngine(args=args, │
│ │
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/engine.py:340 in │
│ __init__ │
│ │
│ 337 │ │ │ model_parameters = self.module.parameters() │
│ 338 │ │ │
│ 339 │ │ if has_optimizer: │
│ ❱ 340 │ │ │ self._configure_optimizer(optimizer, model_parameters) │
│ 341 │ │ │ self._configure_lr_scheduler(lr_scheduler) │
│ 342 │ │ │ self._report_progress(0) │
│ 343 │ │ elif self.zero_optimization(): │
│ │
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/engine.py:1291 in │
│ _configure_optimizer │
│ │
│ 1288 │ │ optimizer_wrapper = self._do_optimizer_sanity_check(basic_opt │
│ 1289 │ │ │
│ 1290 │ │ if optimizer_wrapper == ZERO_OPTIMIZATION: │
│ ❱ 1291 │ │ │ self.optimizer = self._configure_zero_optimizer(basic_opt │
│ 1292 │ │ elif optimizer_wrapper == AMP: │
│ 1293 │ │ │ amp_params = self.amp_params() │
│ 1294 │ │ │ log_dist(f"Initializing AMP with these params: {amp_param │
│ │
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/engine.py:1621 in │
│ _configure_zero_optimizer │
│ │
│ 1618 │ │ │ │ │ gradient_predivide_factor=self.gradient_predivide │
│ 1619 │ │ │ │ │ gradient_accumulation_steps=self.gradient_accumul │
│ 1620 │ │ │ │ │ aio_config=self.aio_config(), │
│ ❱ 1621 │ │ │ │ │ communication_data_type=self.communication_data_t │
│ 1622 │ │ │
│ 1623 │ │ else: │
│ 1624 │ │ │ raise NotImplementedError("ZeRO stage {} not implemented" │
│ │
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/zero/stage3.py:307 │
│ in __init__ │
│ │
│ 304 │ │ │ max([ │
│ 305 │ │ │ │ max(tensor.numel(), │
│ 306 │ │ │ │ │ tensor.ds_numel) for tensor in fp16_partitioned_g │
│ ❱ 307 │ │ │ ]) for fp16_partitioned_group in self.fp16_partitioned_gr │
│ 308 │ │ ]) │
│ 309 │ │ print_rank_0( │
│ 310 │ │ │ f'Largest partitioned param numel = {largest_partitioned_ │
│ │
│ /opt/conda/lib/python3.7/site-packages/deepspeed/runtime/zero/stage3.py:307 │
│ in <listcomp> │
│ │
│ 304 │ │ │ max([ │
│ 305 │ │ │ │ max(tensor.numel(), │
│ 306 │ │ │ │ │ tensor.ds_numel) for tensor in fp16_partitioned_g │
│ ❱ 307 │ │ │ ]) for fp16_partitioned_group in self.fp16_partitioned_gr │
│ 308 │ │ ]) │
│ 309 │ │ print_rank_0( │
│ 310 │ │ │ f'Largest partitioned param numel = {largest_partitioned_ │
╰──────────────────────────────────────────────────────────────────────────────╯
ValueError: max() arg is an empty sequence
[... the same traceback and "ValueError: max() arg is an empty sequence" are repeated verbatim three more times (one copy per remaining worker process) ...]
[23:29:34] WARNING Sending process 44929 closing signal SIGTERM                                    api.py:700
```
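The traceback bottoms out in DeepSpeed's ZeRO stage 3 setup (`stage3.py:307`), where `max()` is taken over the tensors of each `fp16_partitioned_group`. The `ValueError: max() arg is an empty sequence` means that at least one optimizer parameter group reaches ZeRO-3 with no trainable parameters in it, typically because the group is empty or contains only frozen (`requires_grad=False`) parameters. The gist does not include `lft5_cdc.py`, so the snippet below is only a hypothetical sketch of building the optimizer so that no empty group is handed to `accelerator.prepare()`; the checkpoint name, the weight-decay split, and the hyperparameters are assumptions, not taken from the actual script.

```
# Hypothetical sketch, not taken from lft5_cdc.py: build AdamW parameter groups
# and drop any group that ends up empty, so DeepSpeed ZeRO-3 never receives an
# optimizer param group without trainable parameters.
import torch
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xl")  # assumed checkpoint

no_decay = ("bias", "layer_norm.weight")  # assumed no-weight-decay filter
decay_params = [
    p for n, p in model.named_parameters()
    if p.requires_grad and not any(nd in n for nd in no_decay)
]
no_decay_params = [
    p for n, p in model.named_parameters()
    if p.requires_grad and any(nd in n for nd in no_decay)
]

param_groups = [
    {"params": decay_params, "weight_decay": 0.01},
    {"params": no_decay_params, "weight_decay": 0.0},
]
# Guard: an empty group here is what later surfaces as
# "ValueError: max() arg is an empty sequence" in stage3.py.
param_groups = [g for g in param_groups if g["params"]]

optimizer = torch.optim.AdamW(param_groups, lr=1e-4)
# The optimizer (together with the model, dataloaders, and scheduler) would then
# go through accelerator.prepare(...), which is where the traceback above starts.
```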