OOM recover test
======================================================================= | |
Activating fairseq-fp16-20190211 | |
======================================================================= | |
Running mode=single | |
------------------------------------------------------------ | |
Conda PREFIX: /private/home/roller/.conda/envs/fairseq-fp16-20190211 | |
Torch version: 1.0.0.dev20190211 | |
CUDA version: 10.0.130 | |
Using a single GPU | |
Step bs= 8192 | |
Forward with bs = 8192 | |
Backward with bs = 8192 | |
FW/BW succeeded. Doubling BS | |
Step bs= 16384 | |
Forward with bs = 16384 | |
Backward with bs = 16384 | |
FW/BW succeeded. Doubling BS | |
Step bs= 32768 | |
Forward with bs = 32768 | |
Backward with bs = 32768 | |
FW/BW succeeded. Doubling BS | |
Step bs= 65536 | |
Forward with bs = 65536 | |
OOM #1! Running through a tiny batch to catch up worker | |
Forward with bs = 2 | |
Backward with bs = 2 | |
Succeeded on the oom batch. | |
Test passed. | |
Running mode=dp | |
------------------------------------------------------------ | |
Conda PREFIX: /private/home/roller/.conda/envs/fairseq-fp16-20190211 | |
Torch version: 1.0.0.dev20190211 | |
CUDA version: 10.0.130 | |
Wrapping in DataParallel | |
Step bs= 8192 | |
Forward with bs = 8192 | |
Backward with bs = 8192 | |
FW/BW succeeded. Doubling BS | |
Step bs= 16384 | |
Forward with bs = 16384 | |
Backward with bs = 16384 | |
FW/BW succeeded. Doubling BS | |
Step bs= 32768 | |
Forward with bs = 32768 | |
Backward with bs = 32768 | |
FW/BW succeeded. Doubling BS | |
Step bs= 65536 | |
Forward with bs = 65536 | |
Backward with bs = 65536 | |
FW/BW succeeded. Doubling BS | |
Step bs= 131072 | |
Forward with bs = 131072 | |
OOM #1! Running through a tiny batch to catch up worker | |
Forward with bs = 2 | |
Traceback (most recent call last): | |
File "memtestcase.py", line 101, in run_trial | |
fwbw(model, bs) | |
File "memtestcase.py", line 63, in fwbw | |
yhat = model(X) | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/nn/modules/module.py", line 491, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 143, in forward | |
outputs = self.parallel_apply(replicas, inputs, kwargs) | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 153, in parallel_apply | |
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in parallel_apply | |
raise output | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 59, in _worker | |
output = module(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/nn/modules/module.py", line 491, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/nn/modules/container.py", line 97, in forward | |
input = module(input) | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/nn/modules/module.py", line 491, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/nn/modules/activation.py", line 50, in forward | |
return F.threshold(input, self.threshold, self.value, self.inplace) | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/nn/functional.py", line 897, in threshold | |
result = _VF.threshold(input, threshold, value) | |
RuntimeError: CUDA out of memory. Tried to allocate 1024.00 MiB (GPU 0; 15.90 GiB total capacity; 14.25 GiB already allocated; 931.56 MiB free; 607.00 KiB cached) | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "memtestcase.py", line 139, in <module> | |
main() | |
File "memtestcase.py", line 134, in main | |
run_trial(args) | |
File "memtestcase.py", line 113, in run_trial | |
fwbw(model, 2) | |
File "memtestcase.py", line 63, in fwbw | |
yhat = model(X) | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/nn/modules/module.py", line 491, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 142, in forward | |
replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 147, in replicate | |
return replicate(module, device_ids) | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/nn/parallel/replicate.py", line 13, in replicate | |
param_copies = Broadcast.apply(devices, *params) | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/nn/parallel/_functions.py", line 21, in forward | |
outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus) | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/cuda/comm.py", line 40, in broadcast_coalesced | |
return torch._C._broadcast_coalesced(tensors, devices, buffer_size) | |
RuntimeError: CUDA out of memory. Tried to allocate 64.12 MiB (GPU 1; 15.90 GiB total capacity; 15.13 GiB already allocated; 9.56 MiB free; 911.50 KiB cached) (malloc at /pytorch/c10/cuda/CUDACachingAllocator.cpp:236) | |
frame #0: std::function<std::string ()>::operator()() const + 0x11 (0x7fe2637d2371 in /private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/lib/libc10.so) | |
frame #1: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x2a (0x7fe2637d1caa in /private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/lib/libc10.so) | |
frame #2: <unknown function> + 0x1a2f5 (0x7fe261dcc2f5 in /private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/lib/libc10_cuda.so) | |
frame #3: <unknown function> + 0x1ad57 (0x7fe261dccd57 in /private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/lib/libc10_cuda.so) | |
frame #4: at::native::empty_cuda(c10::ArrayRef<long>, c10::TensorOptions const&) + 0x471 (0x7fe27027cc51 in /private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so) | |
frame #5: at::CUDAFloatType::empty(c10::ArrayRef<long>, c10::TensorOptions const&) const + 0x161 (0x7fe26eefbae1 in /private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so) | |
frame #6: torch::autograd::VariableType::empty(c10::ArrayRef<long>, c10::TensorOptions const&) const + 0x186 (0x7fe2629b6506 in /private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/lib/libtorch.so.1) | |
frame #7: torch::cuda::broadcast(at::Tensor const&, c10::ArrayRef<long>) + 0x58d (0x7fe2a6e36f8d in /private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/lib/libtorch_python.so) | |
frame #8: torch::cuda::broadcast_coalesced(c10::ArrayRef<at::Tensor>, c10::ArrayRef<long>, unsigned long) + 0x6f6 (0x7fe2a6e37b16 in /private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/lib/libtorch_python.so) | |
frame #9: <unknown function> + 0x50aa01 (0x7fe2a6e3ba01 in /private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/lib/libtorch_python.so) | |
frame #10: <unknown function> + 0x1188fe (0x7fe2a6a498fe in /private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/lib/libtorch_python.so) | |
<omitting python frames> | |
frame #21: THPFunction_apply(_object*, _object*) + 0x551 (0x7fe2a6c62f41 in /private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/lib/libtorch_python.so) | |
frame #62: __libc_start_main + 0xe7 (0x7fe2bb31bb97 in /lib/x86_64-linux-gnu/libc.so.6) | |
Running mode=ddp_single | |
------------------------------------------------------------ | |
Conda PREFIX: /private/home/roller/.conda/envs/fairseq-fp16-20190211 | |
Torch version: 1.0.0.dev20190211 | |
CUDA version: 10.0.130 | |
Using a single GPU in distributed (equiv to 1 proc per gpu) | |
Step bs= 8192 | |
Forward with bs = 8192 | |
Backward with bs = 8192 | |
FW/BW succeeded. Doubling BS | |
Step bs= 16384 | |
Forward with bs = 16384 | |
Backward with bs = 16384 | |
FW/BW succeeded. Doubling BS | |
Step bs= 32768 | |
Forward with bs = 32768 | |
Backward with bs = 32768 | |
FW/BW succeeded. Doubling BS | |
Step bs= 65536 | |
Forward with bs = 65536 | |
OOM #1! Running through a tiny batch to catch up worker | |
Forward with bs = 2 | |
Backward with bs = 2 | |
Succeeded on the oom batch. | |
Test passed. | |
Running mode=ddp_multi | |
------------------------------------------------------------ | |
Conda PREFIX: /private/home/roller/.conda/envs/fairseq-fp16-20190211 | |
Torch version: 1.0.0.dev20190211 | |
CUDA version: 10.0.130 | |
Wrapping in DistributedDataParallel (equiv to 1 proc per node) | |
Step bs= 8192 | |
Forward with bs = 8192 | |
Traceback (most recent call last): | |
File "memtestcase.py", line 139, in <module> | |
main() | |
File "memtestcase.py", line 134, in main | |
run_trial(args) | |
File "memtestcase.py", line 107, in run_trial | |
raise rerr | |
File "memtestcase.py", line 101, in run_trial | |
fwbw(model, bs) | |
File "memtestcase.py", line 63, in fwbw | |
yhat = model(X) | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/nn/modules/module.py", line 491, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 360, in forward | |
self._sync_params() | |
File "/private/home/roller/.conda/envs/fairseq-fp16-20190211/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 392, in _sync_params | |
param_data.set_(tensor) | |
RuntimeError: set_storage is not allowed on Tensor created from .data or .detach() | |
Running mode=single | |
------------------------------------------------------------ | |
Conda PREFIX: /private/home/kshuster/miniconda3 | |
Torch version: 1.0.0 | |
CUDA version: 9.0.176 | |
Using a single GPU | |
Step bs= 8192 | |
Forward with bs = 8192 | |
Backward with bs = 8192 | |
FW/BW succeeded. Doubling BS | |
Step bs= 16384 | |
Forward with bs = 16384 | |
Backward with bs = 16384 | |
FW/BW succeeded. Doubling BS | |
Step bs= 32768 | |
Forward with bs = 32768 | |
Backward with bs = 32768 | |
FW/BW succeeded. Doubling BS | |
Step bs= 65536 | |
Forward with bs = 65536 | |
OOM #1! Running through a tiny batch to catch up worker | |
Forward with bs = 2 | |
Backward with bs = 2 | |
Succeeded on the oom batch. | |
Test passed. | |
Running mode=dp | |
------------------------------------------------------------ | |
Conda PREFIX: /private/home/kshuster/miniconda3 | |
Torch version: 1.0.0 | |
CUDA version: 9.0.176 | |
Wrapping in DataParallel | |
Step bs= 8192 | |
Forward with bs = 8192 | |
Backward with bs = 8192 | |
FW/BW succeeded. Doubling BS | |
Step bs= 16384 | |
Forward with bs = 16384 | |
Backward with bs = 16384 | |
FW/BW succeeded. Doubling BS | |
Step bs= 32768 | |
Forward with bs = 32768 | |
Backward with bs = 32768 | |
FW/BW succeeded. Doubling BS | |
Step bs= 65536 | |
Forward with bs = 65536 | |
Backward with bs = 65536 | |
FW/BW succeeded. Doubling BS | |
Step bs= 131072 | |
Forward with bs = 131072 | |
OOM #1! Running through a tiny batch to catch up worker | |
Forward with bs = 2 | |
Traceback (most recent call last): | |
File "memtestcase.py", line 92, in run_trial | |
fwbw(model, bs) | |
File "memtestcase.py", line 54, in fwbw | |
yhat = model(X) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 143, in forward | |
outputs = self.parallel_apply(replicas, inputs, kwargs) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 153, in parallel_apply | |
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in parallel_apply | |
raise output | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 59, in _worker | |
output = module(*input, **kwargs) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/modules/container.py", line 92, in forward | |
input = module(input) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/modules/activation.py", line 50, in forward | |
return F.threshold(input, self.threshold, self.value, self.inplace) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/functional.py", line 840, in threshold | |
result = _VF.threshold(input, threshold, value) | |
RuntimeError: CUDA out of memory. Tried to allocate 1024.00 MiB (GPU 0; 15.90 GiB total capacity; 14.25 GiB already allocated; 933.56 MiB free; 607.00 KiB cached) | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "memtestcase.py", line 130, in <module> | |
main() | |
File "memtestcase.py", line 125, in main | |
run_trial(args) | |
File "memtestcase.py", line 104, in run_trial | |
fwbw(model, 2) | |
File "memtestcase.py", line 54, in fwbw | |
yhat = model(X) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 142, in forward | |
replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 147, in replicate | |
return replicate(module, device_ids) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/parallel/replicate.py", line 13, in replicate | |
param_copies = Broadcast.apply(devices, *params) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/parallel/_functions.py", line 21, in forward | |
outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/cuda/comm.py", line 40, in broadcast_coalesced | |
return torch._C._broadcast_coalesced(tensors, devices, buffer_size) | |
RuntimeError: CUDA out of memory. Tried to allocate 64.12 MiB (GPU 1; 15.90 GiB total capacity; 15.13 GiB already allocated; 11.56 MiB free; 911.50 KiB cached) (malloc at /pytorch/aten/src/THC/THCCachingAllocator.cpp:231) | |
frame #0: std::function<std::string ()>::operator()() const + 0x11 (0x7fe54a71dfe1 in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libc10.so) | |
frame #1: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x2a (0x7fe54a71ddfa in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libc10.so) | |
frame #2: <unknown function> + 0x13cf9c5 (0x7fe4815bc9c5 in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so) | |
frame #3: <unknown function> + 0x13d077a (0x7fe4815bd77a in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so) | |
frame #4: at::native::empty_cuda(c10::ArrayRef<long>, at::TensorOptions const&) + 0x443 (0x7fe48274fa43 in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so) | |
frame #5: at::CUDAFloatType::empty(c10::ArrayRef<long>, at::TensorOptions const&) const + 0x161 (0x7fe4814d6531 in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so) | |
frame #6: torch::autograd::VariableType::empty(c10::ArrayRef<long>, at::TensorOptions const&) const + 0x179 (0x7fe543222df9 in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libtorch.so.1) | |
frame #7: torch::cuda::broadcast(at::Tensor const&, c10::ArrayRef<long>) + 0x545 (0x7fe54ae1bd25 in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libtorch_python.so) | |
frame #8: torch::cuda::broadcast_coalesced(c10::ArrayRef<at::Tensor>, c10::ArrayRef<long>, unsigned long) + 0x7f6 (0x7fe54ae1c9a6 in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libtorch_python.so) | |
frame #9: <unknown function> + 0x4f5c59 (0x7fe54ae20c59 in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libtorch_python.so) | |
frame #10: <unknown function> + 0x116fac (0x7fe54aa41fac in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libtorch_python.so) | |
<omitting python frames> | |
frame #21: THPFunction_apply(_object*, _object*) + 0x581 (0x7fe54ac3f4d1 in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libtorch_python.so) | |
frame #62: __libc_start_main + 0xe7 (0x7fe557a1bb97 in /lib/x86_64-linux-gnu/libc.so.6) | |
Running mode=ddp_single | |
------------------------------------------------------------ | |
Conda PREFIX: /private/home/kshuster/miniconda3 | |
Torch version: 1.0.0 | |
CUDA version: 9.0.176 | |
Using a single GPU in distributed (equiv to 1 proc per gpu) | |
Step bs= 8192 | |
Forward with bs = 8192 | |
Backward with bs = 8192 | |
FW/BW succeeded. Doubling BS | |
Step bs= 16384 | |
Forward with bs = 16384 | |
Backward with bs = 16384 | |
FW/BW succeeded. Doubling BS | |
Step bs= 32768 | |
Forward with bs = 32768 | |
Backward with bs = 32768 | |
FW/BW succeeded. Doubling BS | |
Step bs= 65536 | |
Forward with bs = 65536 | |
OOM #1! Running through a tiny batch to catch up worker | |
Forward with bs = 2 | |
Backward with bs = 2 | |
Succeeded on the oom batch. | |
Test passed. | |
Running mode=ddp_multi | |
------------------------------------------------------------ | |
Conda PREFIX: /private/home/kshuster/miniconda3 | |
Torch version: 1.0.0 | |
CUDA version: 9.0.176 | |
Wrapping in DistributedDataParallel (equiv to 1 proc per node) | |
Step bs= 8192 | |
Forward with bs = 8192 | |
Backward with bs = 8192 | |
FW/BW succeeded. Doubling BS | |
Step bs= 16384 | |
Forward with bs = 16384 | |
Backward with bs = 16384 | |
FW/BW succeeded. Doubling BS | |
Step bs= 32768 | |
Forward with bs = 32768 | |
Backward with bs = 32768 | |
FW/BW succeeded. Doubling BS | |
Step bs= 65536 | |
Forward with bs = 65536 | |
Backward with bs = 65536 | |
FW/BW succeeded. Doubling BS | |
Step bs= 131072 | |
Forward with bs = 131072 | |
OOM #1! Running through a tiny batch to catch up worker | |
Forward with bs = 2 | |
Traceback (most recent call last): | |
File "memtestcase.py", line 92, in run_trial | |
fwbw(model, bs) | |
File "memtestcase.py", line 54, in fwbw | |
yhat = model(X) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 358, in forward | |
outputs = self.parallel_apply(self._module_copies[:len(inputs)], inputs, kwargs) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 365, in parallel_apply | |
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in parallel_apply | |
raise output | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 59, in _worker | |
output = module(*input, **kwargs) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/modules/container.py", line 92, in forward | |
input = module(input) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/modules/activation.py", line 50, in forward | |
return F.threshold(input, self.threshold, self.value, self.inplace) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/functional.py", line 840, in threshold | |
result = _VF.threshold(input, threshold, value) | |
RuntimeError: CUDA out of memory. Tried to allocate 1024.00 MiB (GPU 0; 15.90 GiB total capacity; 14.25 GiB already allocated; 927.56 MiB free; 607.00 KiB cached) | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "memtestcase.py", line 130, in <module> | |
main() | |
File "memtestcase.py", line 125, in main | |
run_trial(args) | |
File "memtestcase.py", line 104, in run_trial | |
fwbw(model, 2) | |
File "memtestcase.py", line 54, in fwbw | |
yhat = model(X) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 355, in forward | |
self._sync_params() | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 384, in _sync_params | |
self.broadcast_bucket_size) | |
File "/private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/cuda/comm.py", line 40, in broadcast_coalesced | |
return torch._C._broadcast_coalesced(tensors, devices, buffer_size) | |
RuntimeError: CUDA out of memory. Tried to allocate 128.12 MiB (GPU 1; 15.90 GiB total capacity; 15.13 GiB already allocated; 25.56 MiB free; 992.00 KiB cached) (malloc at /pytorch/aten/src/THC/THCCachingAllocator.cpp:231) | |
frame #0: std::function<std::string ()>::operator()() const + 0x11 (0x7fde24f7afe1 in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libc10.so) | |
frame #1: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x2a (0x7fde24f7adfa in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libc10.so) | |
frame #2: <unknown function> + 0x13cf9c5 (0x7fdd58c029c5 in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so) | |
frame #3: <unknown function> + 0x13d077a (0x7fdd58c0377a in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so) | |
frame #4: at::native::empty_cuda(c10::ArrayRef<long>, at::TensorOptions const&) + 0x443 (0x7fdd59d95a43 in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so) | |
frame #5: at::CUDAFloatType::empty(c10::ArrayRef<long>, at::TensorOptions const&) const + 0x161 (0x7fdd58b1c531 in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so) | |
frame #6: torch::autograd::VariableType::empty(c10::ArrayRef<long>, at::TensorOptions const&) const + 0x179 (0x7fde0c816df9 in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libtorch.so.1) | |
frame #7: torch::cuda::broadcast(at::Tensor const&, c10::ArrayRef<long>) + 0x545 (0x7fde1e413d25 in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libtorch_python.so) | |
frame #8: torch::cuda::broadcast_coalesced(c10::ArrayRef<at::Tensor>, c10::ArrayRef<long>, unsigned long) + 0x7f6 (0x7fde1e4149a6 in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libtorch_python.so) | |
frame #9: <unknown function> + 0x4f5c59 (0x7fde1e418c59 in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libtorch_python.so) | |
frame #10: <unknown function> + 0x116fac (0x7fde1e039fac in /private/home/kshuster/miniconda3/lib/python3.6/site-packages/torch/lib/libtorch_python.so) | |
<omitting python frames> | |
frame #52: __libc_start_main + 0xe7 (0x7fde2d814b97 in /lib/x86_64-linux-gnu/libc.so.6) | |
#!/usr/bin/env python

import os
import argparse

import torch
import torch.nn as nn
import torch.distributed as td
import torch.nn.parallel as tp

START_BS = 8 * 1024

# these don't matter, just constants meant to be a "big" model
INPUT_SIZE = 8192
HID_SIZE = 4096
LAYERS = 8
OUT_CLASSES = 4


def wrap_dp(model):
    return tp.DataParallel(model)


def wrap_ddp(model):
    td.init_process_group(
        backend='nccl',
        init_method='tcp://localhost:61337',
        rank=0,
        world_size=1
    )
    model = tp.DistributedDataParallel(
        model,
        device_ids=None,
        broadcast_buffers=False,
    )
    return model


def create_model(args):
    model = nn.Sequential(
        nn.Linear(INPUT_SIZE, HID_SIZE),
        nn.ReLU(),
    )
    for i in range(LAYERS):
        model.add_module('hidd' + str(i), nn.Linear(HID_SIZE, HID_SIZE))
        model.add_module('relu' + str(i), nn.ReLU())
    model.add_module('output', nn.Linear(HID_SIZE, OUT_CLASSES))
    return model


def fwbw(model, bs):
    print(' Forward with bs = {:-6d}'.format(bs))
    X = torch.randn(bs, INPUT_SIZE).cuda()
    torch.cuda.synchronize()
    yhat = model(X)
    torch.cuda.synchronize()
    loss = yhat.sum()
    torch.cuda.synchronize()
    print(' Backward with bs = {:-6d}'.format(bs))
    loss.backward()
    torch.cuda.synchronize()
    model.zero_grad()
    torch.cuda.synchronize()


def run_trial(args):
    print('Conda PREFIX:', os.environ['CONDA_PREFIX'])
    print('Torch version:', torch.version.__version__)
    print('CUDA version:', torch.version.cuda)

    model = create_model(args).cuda()
    if args.mode == 'dp':
        print('Wrapping in DataParallel')
        model = wrap_dp(model)
    elif args.mode == 'ddp_multi':
        print('Wrapping in DistributedDataParallel (equiv to 1 proc per node)')
        model = wrap_ddp(model)
    elif args.mode == 'ddp_single':
        print('Using a single GPU in distributed (equiv to 1 proc per gpu)')
        torch.cuda.set_device(0)
    elif args.mode == 'single':
        print('Using a single GPU')
        pass
    else:
        raise ValueError('--mode wrong')

    bs = args.bs
    times_oomed = 0
    while times_oomed < args.ooms:
        # continuously double the batch size until we OOM
        try:
            print('Step bs=', bs)
            fwbw(model, bs)
            print('FW/BW succeeded. Doubling BS')
            bs *= 2
        except RuntimeError as rerr:
            if 'memory' not in str(rerr):
                # not the exception we wanted
                raise rerr
            # okay, we found the memory error! Now try to run a NOOP pass
            # for DDP nodes. Production example here:
            # https://github.com/pytorch/fairseq/blob/3658fa329b8cb987d951b2e38ec86c44b9e1fea5/fairseq/trainer.py#L361-L368
            times_oomed += 1
            print('OOM #{}! Running through a tiny batch to catch up worker'.format(times_oomed))
            fwbw(model, 2)
            print('Succeeded on the oom batch.')
            # start the doubling procedure again
            bs = args.bs


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--mode', default='ddp', choices=('dp', 'ddp_multi', 'ddp_single', 'single'),
        help='DataParallel, DistributedDataParallel, or single gpu'
    )
    parser.add_argument(
        '--ooms', default=1, type=int,
        help='Number of times to OOM'
    )
    parser.add_argument(
        '--bs', default=START_BS, type=int,
        help='Initial batch size',
    )
    args = parser.parse_args()
    run_trial(args)
    print('Test passed.')


if __name__ == '__main__':
    main()
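
For readers skimming the logs, the recovery pattern that memtestcase.py exercises between "OOM #N!" and either "Succeeded on the oom batch." or a second traceback boils down to the sketch below. This is a minimal illustration, not the fairseq trainer code linked in the comment above: step_with_oom_recovery, make_batch, and fallback_bs are placeholder names, and the torch.cuda.empty_cache() call is an extra step that the test script itself does not take.

import torch


def step_with_oom_recovery(model, make_batch, bs, fallback_bs=2):
    """Try one forward/backward at batch size `bs`; on a CUDA OOM, drop the
    failed step and run a tiny batch so data-parallel workers that did not
    OOM still see the same number of forward/backward passes."""
    try:
        loss = model(make_batch(bs)).sum()
        loss.backward()
        return bs
    except RuntimeError as err:
        if 'out of memory' not in str(err):
            raise
        model.zero_grad()          # discard gradients from the failed step
        torch.cuda.empty_cache()   # release cached blocks before retrying
        loss = model(make_batch(fallback_bs)).sum()
        loss.backward()
        return fallback_bs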
#!/bin/bash

for mode in single dp ddp_single ddp_multi
do
    echo "Running mode=$mode"
    echo "------------------------------------------------------------"
    python -u memtestcase.py --mode=$mode 2>&1
    echo
done
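
To reproduce a single configuration without the loop, memtestcase.py can also be invoked directly; --mode, --ooms, and --bs are the only flags it defines, and the values below are just an example.

# one-off run: force two OOM/recover cycles in DataParallel mode
python -u memtestcase.py --mode=dp --ooms=2 --bs=8192 2>&1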
#!/bin/bash

nvidia-smi

. /public/apps/anaconda3/5.0.1/etc/profile.d/conda.sh

echo "======================================================================="
echo "Activating fairseq-fp16-20190211"
echo "======================================================================="
conda deactivate
conda activate fairseq-fp16-20190211

for mode in single dp ddp_single ddp_multi
do
    echo "Running mode=$mode"
    echo "------------------------------------------------------------"
    python -u memtestcase.py --mode=$mode 2>&1
    echo
done

echo
echo "======================================================================="
echo "Activating pytorch stable"
echo "======================================================================="
conda deactivate
conda activate retry-20190211

for mode in single dp ddp_single ddp_multi
do
    echo "Running mode=$mode"
    echo "------------------------------------------------------------"
    python -u memtestcase.py --mode=$mode 2>&1
    echo
done
======================================================================= | |
Activating pytorch stable | |
======================================================================= | |
Running mode=single | |
------------------------------------------------------------ | |
Conda PREFIX: /private/home/roller/.conda/envs/retry-20190211 | |
Torch version: 1.0.1.post2 | |
CUDA version: 10.0.130 | |
Using a single GPU | |
Step bs= 8192 | |
Forward with bs = 8192 | |
Backward with bs = 8192 | |
FW/BW succeeded. Doubling BS | |
Step bs= 16384 | |
Forward with bs = 16384 | |
Backward with bs = 16384 | |
FW/BW succeeded. Doubling BS | |
Step bs= 32768 | |
Forward with bs = 32768 | |
Backward with bs = 32768 | |
FW/BW succeeded. Doubling BS | |
Step bs= 65536 | |
Forward with bs = 65536 | |
OOM #1! Running through a tiny batch to catch up worker | |
Forward with bs = 2 | |
Backward with bs = 2 | |
Traceback (most recent call last): | |
File "memtestcase.py", line 101, in run_trial | |
fwbw(model, bs) | |
File "memtestcase.py", line 63, in fwbw | |
yhat = model(X) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/container.py", line 92, in forward | |
input = module(input) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/linear.py", line 67, in forward | |
return F.linear(input, self.weight, self.bias) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/functional.py", line 1352, in linear | |
ret = torch.addmm(torch.jit._unwrap_optional(bias), input, weight.t()) | |
RuntimeError: CUDA out of memory. Tried to allocate 1024.00 MiB (GPU 0; 15.90 GiB total capacity; 15.25 GiB already allocated; 25.56 MiB free; 607.00 KiB cached) | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "memtestcase.py", line 139, in <module> | |
main() | |
File "memtestcase.py", line 134, in main | |
run_trial(args) | |
File "memtestcase.py", line 113, in run_trial | |
fwbw(model, 2) | |
File "memtestcase.py", line 68, in fwbw | |
loss.backward() | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/tensor.py", line 102, in backward | |
torch.autograd.backward(self, gradient, retain_graph, create_graph) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/autograd/__init__.py", line 90, in backward | |
allow_unreachable=True) # allow_unreachable flag | |
RuntimeError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 15.90 GiB total capacity; 15.25 GiB already allocated; 25.56 MiB free; 989.50 KiB cached) | |
Running mode=dp | |
------------------------------------------------------------ | |
Conda PREFIX: /private/home/roller/.conda/envs/retry-20190211 | |
Torch version: 1.0.1.post2 | |
CUDA version: 10.0.130 | |
Wrapping in DataParallel | |
Step bs= 8192 | |
Forward with bs = 8192 | |
Backward with bs = 8192 | |
FW/BW succeeded. Doubling BS | |
Step bs= 16384 | |
Forward with bs = 16384 | |
Backward with bs = 16384 | |
FW/BW succeeded. Doubling BS | |
Step bs= 32768 | |
Forward with bs = 32768 | |
Backward with bs = 32768 | |
FW/BW succeeded. Doubling BS | |
Step bs= 65536 | |
Forward with bs = 65536 | |
Backward with bs = 65536 | |
FW/BW succeeded. Doubling BS | |
Step bs= 131072 | |
Forward with bs = 131072 | |
OOM #1! Running through a tiny batch to catch up worker | |
Forward with bs = 2 | |
Traceback (most recent call last): | |
File "memtestcase.py", line 101, in run_trial | |
fwbw(model, bs) | |
File "memtestcase.py", line 63, in fwbw | |
yhat = model(X) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 143, in forward | |
outputs = self.parallel_apply(replicas, inputs, kwargs) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 153, in parallel_apply | |
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in parallel_apply | |
raise output | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 59, in _worker | |
output = module(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/container.py", line 92, in forward | |
input = module(input) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/activation.py", line 50, in forward | |
return F.threshold(input, self.threshold, self.value, self.inplace) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/functional.py", line 840, in threshold | |
result = _VF.threshold(input, threshold, value) | |
RuntimeError: CUDA out of memory. Tried to allocate 1024.00 MiB (GPU 0; 15.90 GiB total capacity; 14.25 GiB already allocated; 997.56 MiB free; 607.00 KiB cached) | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "memtestcase.py", line 139, in <module> | |
main() | |
File "memtestcase.py", line 134, in main | |
run_trial(args) | |
File "memtestcase.py", line 113, in run_trial | |
fwbw(model, 2) | |
File "memtestcase.py", line 63, in fwbw | |
yhat = model(X) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 142, in forward | |
replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 147, in replicate | |
return replicate(module, device_ids) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/parallel/replicate.py", line 13, in replicate | |
param_copies = Broadcast.apply(devices, *params) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/parallel/_functions.py", line 21, in forward | |
outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/cuda/comm.py", line 40, in broadcast_coalesced | |
return torch._C._broadcast_coalesced(tensors, devices, buffer_size) | |
RuntimeError: CUDA out of memory. Tried to allocate 64.12 MiB (GPU 1; 15.90 GiB total capacity; 15.19 GiB already allocated; 9.56 MiB free; 911.50 KiB cached) (malloc at /opt/conda/conda-bld/pytorch_1549636813070/work/aten/src/THC/THCCachingAllocator.cpp:231) | |
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x45 (0x7fe27c805cf5 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libc10.so) | |
frame #1: <unknown function> + 0x1239bc1 (0x7fe280ae7bc1 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libcaffe2_gpu.so) | |
frame #2: <unknown function> + 0x123a53a (0x7fe280ae853a in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libcaffe2_gpu.so) | |
frame #3: at::native::empty_cuda(c10::ArrayRef<long>, at::TensorOptions const&) + 0x2d6 (0x7fe282152db6 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libcaffe2_gpu.so) | |
frame #4: at::CUDAFloatType::empty(c10::ArrayRef<long>, at::TensorOptions const&) const + 0x161 (0x7fe280a06311 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libcaffe2_gpu.so) | |
frame #5: torch::autograd::VariableType::empty(c10::ArrayRef<long>, at::TensorOptions const&) const + 0x179 (0x7fe275a3e209 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libtorch.so.1) | |
frame #6: torch::cuda::broadcast(at::Tensor const&, c10::ArrayRef<long>) + 0x545 (0x7fe2a3ed7725 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libtorch_python.so) | |
frame #7: torch::cuda::broadcast_coalesced(c10::ArrayRef<at::Tensor>, c10::ArrayRef<long>, unsigned long) + 0x7e6 (0x7fe2a3ed8396 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libtorch_python.so) | |
frame #8: <unknown function> + 0x4f2be6 (0x7fe2a3edcbe6 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libtorch_python.so) | |
frame #9: <unknown function> + 0x111af6 (0x7fe2a3afbaf6 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libtorch_python.so) | |
<omitting python frames> | |
frame #18: THPFunction_apply(_object*, _object*) + 0x5a1 (0x7fe2a3cf7061 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libtorch_python.so) | |
frame #51: __libc_start_main + 0xe7 (0x7fe2b52adb97 in /lib/x86_64-linux-gnu/libc.so.6) | |
Running mode=ddp_single | |
------------------------------------------------------------ | |
Conda PREFIX: /private/home/roller/.conda/envs/retry-20190211 | |
Torch version: 1.0.1.post2 | |
CUDA version: 10.0.130 | |
Using a single GPU in distributed (equiv to 1 proc per gpu) | |
Step bs= 8192 | |
Forward with bs = 8192 | |
Backward with bs = 8192 | |
FW/BW succeeded. Doubling BS | |
Step bs= 16384 | |
Forward with bs = 16384 | |
Backward with bs = 16384 | |
FW/BW succeeded. Doubling BS | |
Step bs= 32768 | |
Forward with bs = 32768 | |
Backward with bs = 32768 | |
FW/BW succeeded. Doubling BS | |
Step bs= 65536 | |
Forward with bs = 65536 | |
OOM #1! Running through a tiny batch to catch up worker | |
Forward with bs = 2 | |
Backward with bs = 2 | |
Traceback (most recent call last): | |
File "memtestcase.py", line 101, in run_trial | |
fwbw(model, bs) | |
File "memtestcase.py", line 63, in fwbw | |
yhat = model(X) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/container.py", line 92, in forward | |
input = module(input) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/linear.py", line 67, in forward | |
return F.linear(input, self.weight, self.bias) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/functional.py", line 1352, in linear | |
ret = torch.addmm(torch.jit._unwrap_optional(bias), input, weight.t()) | |
RuntimeError: CUDA out of memory. Tried to allocate 1024.00 MiB (GPU 0; 15.90 GiB total capacity; 15.25 GiB already allocated; 25.56 MiB free; 607.00 KiB cached) | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "memtestcase.py", line 139, in <module> | |
main() | |
File "memtestcase.py", line 134, in main | |
run_trial(args) | |
File "memtestcase.py", line 113, in run_trial | |
fwbw(model, 2) | |
File "memtestcase.py", line 68, in fwbw | |
loss.backward() | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/tensor.py", line 102, in backward | |
torch.autograd.backward(self, gradient, retain_graph, create_graph) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/autograd/__init__.py", line 90, in backward | |
allow_unreachable=True) # allow_unreachable flag | |
RuntimeError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 15.90 GiB total capacity; 15.25 GiB already allocated; 25.56 MiB free; 989.50 KiB cached) | |
Running mode=ddp_multi | |
------------------------------------------------------------ | |
Conda PREFIX: /private/home/roller/.conda/envs/retry-20190211 | |
Torch version: 1.0.1.post2 | |
CUDA version: 10.0.130 | |
Wrapping in DistributedDataParallel (equiv to 1 proc per node) | |
Step bs= 8192 | |
Forward with bs = 8192 | |
Backward with bs = 8192 | |
FW/BW succeeded. Doubling BS | |
Step bs= 16384 | |
Forward with bs = 16384 | |
Backward with bs = 16384 | |
FW/BW succeeded. Doubling BS | |
Step bs= 32768 | |
Forward with bs = 32768 | |
Backward with bs = 32768 | |
FW/BW succeeded. Doubling BS | |
Step bs= 65536 | |
Forward with bs = 65536 | |
Backward with bs = 65536 | |
FW/BW succeeded. Doubling BS | |
Step bs= 131072 | |
Forward with bs = 131072 | |
OOM #1! Running through a tiny batch to catch up worker | |
Forward with bs = 2 | |
Traceback (most recent call last): | |
File "memtestcase.py", line 101, in run_trial | |
fwbw(model, bs) | |
File "memtestcase.py", line 63, in fwbw | |
yhat = model(X) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 358, in forward | |
outputs = self.parallel_apply(self._module_copies[:len(inputs)], inputs, kwargs) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 365, in parallel_apply | |
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in parallel_apply | |
raise output | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 59, in _worker | |
output = module(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/container.py", line 92, in forward | |
input = module(input) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/activation.py", line 50, in forward | |
return F.threshold(input, self.threshold, self.value, self.inplace) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/functional.py", line 840, in threshold | |
result = _VF.threshold(input, threshold, value) | |
RuntimeError: CUDA out of memory. Tried to allocate 1024.00 MiB (GPU 0; 15.90 GiB total capacity; 14.25 GiB already allocated; 991.56 MiB free; 607.00 KiB cached) | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "memtestcase.py", line 139, in <module> | |
main() | |
File "memtestcase.py", line 134, in main | |
run_trial(args) | |
File "memtestcase.py", line 113, in run_trial | |
fwbw(model, 2) | |
File "memtestcase.py", line 63, in fwbw | |
yhat = model(X) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/modules/module.py", line 489, in __call__ | |
result = self.forward(*input, **kwargs) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 355, in forward | |
self._sync_params() | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 384, in _sync_params | |
self.broadcast_bucket_size) | |
File "/private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/cuda/comm.py", line 40, in broadcast_coalesced | |
return torch._C._broadcast_coalesced(tensors, devices, buffer_size) | |
RuntimeError: CUDA out of memory. Tried to allocate 128.12 MiB (GPU 1; 15.90 GiB total capacity; 15.13 GiB already allocated; 89.56 MiB free; 992.00 KiB cached) (malloc at /opt/conda/conda-bld/pytorch_1549636813070/work/aten/src/THC/THCCachingAllocator.cpp:231) | |
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x45 (0x7ff7535bbcf5 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libc10.so) | |
frame #1: <unknown function> + 0x1239bc1 (0x7ff75789dbc1 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libcaffe2_gpu.so) | |
frame #2: <unknown function> + 0x123a53a (0x7ff75789e53a in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libcaffe2_gpu.so) | |
frame #3: at::native::empty_cuda(c10::ArrayRef<long>, at::TensorOptions const&) + 0x2d6 (0x7ff758f08db6 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libcaffe2_gpu.so) | |
frame #4: at::CUDAFloatType::empty(c10::ArrayRef<long>, at::TensorOptions const&) const + 0x161 (0x7ff7577bc311 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libcaffe2_gpu.so) | |
frame #5: torch::autograd::VariableType::empty(c10::ArrayRef<long>, at::TensorOptions const&) const + 0x179 (0x7ff74c7f4209 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libtorch.so.1) | |
frame #6: torch::cuda::broadcast(at::Tensor const&, c10::ArrayRef<long>) + 0x545 (0x7ff77ac8d725 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libtorch_python.so) | |
frame #7: torch::cuda::broadcast_coalesced(c10::ArrayRef<at::Tensor>, c10::ArrayRef<long>, unsigned long) + 0x7e6 (0x7ff77ac8e396 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libtorch_python.so) | |
frame #8: <unknown function> + 0x4f2be6 (0x7ff77ac92be6 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libtorch_python.so) | |
frame #9: <unknown function> + 0x111af6 (0x7ff77a8b1af6 in /private/home/roller/.conda/envs/retry-20190211/lib/python3.7/site-packages/torch/lib/libtorch_python.so) | |
<omitting python frames> | |
frame #43: __libc_start_main + 0xe7 (0x7ff78c063b97 in /lib/x86_64-linux-gnu/libc.so.6) | |