Created
May 17, 2021 19:18
-
-
Save exelents/dd64ddd745bfa732a809a6b3e9af678d to your computer and use it in GitHub Desktop.
cpu offload error
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[2021-05-17 22:17:54,087] [INFO] [stage2.py:130:__init__] Reduce bucket size 50000000
[2021-05-17 22:17:54,088] [INFO] [stage2.py:131:__init__] Allgather bucket size 200000000
[2021-05-17 22:17:54,088] [INFO] [stage2.py:132:__init__] CPU Offload: True
Traceback (most recent call last):
  File "../pretrain_gpt3.py", line 832, in <module>
    main()
  File "../pretrain_gpt3.py", line 788, in main
    model, optimizer, lr_scheduler = setup_model_and_optimizer(args)
  File "../pretrain_gpt3.py", line 190, in setup_model_and_optimizer
    dist_init_required=False
  File "/export/DeepSpeed-triton3/deepspeed/__init__.py", line 125, in initialize
    config_params=config_params)
  File "/export/DeepSpeed-triton3/deepspeed/runtime/engine.py", line 186, in __init__
    self._configure_optimizer(optimizer, model_parameters)
  File "/export/DeepSpeed-triton3/deepspeed/runtime/engine.py", line 624, in _configure_optimizer
    self.optimizer = self._configure_zero_optimizer(basic_optimizer)
  File "/export/DeepSpeed-triton3/deepspeed/runtime/engine.py", line 775, in _configure_zero_optimizer
    gradient_accumulation_steps=self.gradient_accumulation_steps())
  File "/export/DeepSpeed-triton3/deepspeed/runtime/zero/stage2.py", line 395, in __init__
    self.initialize_optimizer_states()
  File "/export/DeepSpeed-triton3/deepspeed/runtime/zero/stage2.py", line 421, in initialize_optimizer_states
    self.optimizer.step()
  File "/home/fellow/.virtualenvs/rugpt37/lib/python3.7/site-packages/torch/optim/optimizer.py", line 89, in wrapper
    return func(*args, **kwargs)
  File "/home/fellow/.virtualenvs/rugpt37/lib/python3.7/site-packages/apex/optimizers/fused_adam.py", line 169, in step
    group['weight_decay'])
  File "/home/fellow/.virtualenvs/rugpt37/lib/python3.7/site-packages/apex/multi_tensor_apply/multi_tensor_apply.py", line 30, in __call__
    *args)
RuntimeError: expected input to be on cuda
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:
  Process name: [[24721,1],0]
  Exit code:    1
--------------------------------------------------------------------------
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment