Skip to content

Instantly share code, notes, and snippets.

@yujiepan-work
Last active February 17, 2023 07:04
Show Gist options
  • Save yujiepan-work/964d4716902ee75bf132dc4d80c96e61 to your computer and use it in GitHub Desktop.
DP "not same device" crash after onnx export
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import nncf
from nncf.config import NNCFConfig
from nncf.config.structures import (BNAdaptationInitArgs,
QuantizationRangeInitArgs)
from nncf.torch import create_compressed_model
from nncf.torch.initialization import PTInitializingDataLoader
class MyModel(nn.Module):
    """Tiny demo model: a single fully-connected layer mapping 4 -> 1."""

    def __init__(self) -> None:
        super().__init__()
        # One linear layer is enough to reproduce the issue.
        self.linear = nn.Linear(4, 1)

    def forward(self, x):
        """Return the linear projection of *x*."""
        return self.linear(x)
class MyDataset(Dataset):
    """Synthetic dataset: 100 items, each a dict with a random 4-dim tensor."""

    def __len__(self):
        # Fixed-size dataset.
        return 100

    def __getitem__(self, i):
        # The index is ignored; every access yields a fresh random vector.
        return {'x': torch.rand(4)}
class MyInitializingDataloader(PTInitializingDataLoader):
    """Tell NNCF how to split a batch into (args, kwargs) for the model.

    Batches here are dicts, so the whole batch is forwarded as keyword
    arguments and no positional arguments are used.
    """

    def get_inputs(self, dataloader_output):
        # Empty positional tuple; the dict batch becomes the kwargs.
        return (), dataloader_output
# Reproduction for the DataParallel "not same device" crash after ONNX export.
# Dataloader used both for training and for NNCF's initialization passes.
dataset = MyDataset()
train_dataloader = DataLoader(dataset, batch_size=8)

# Minimal NNCF config: one float [1, 4] input, quantization compression only.
nncf_config = NNCFConfig.from_dict({
    "input_info": [
        {"sample_size": [1, 4],
         "type": "float"
         }
    ],
    "compression": [
        {
            "algorithm": "quantization"
        }
    ]
})
# Register the dataloaders NNCF needs for quantizer range initialization and
# batch-norm adaptation.
nncf_config.register_extra_structs([
    QuantizationRangeInitArgs(MyInitializingDataloader(train_dataloader)),
    BNAdaptationInitArgs(MyInitializingDataloader(train_dataloader)),
])

model = MyModel()
# Wrap with NNCF compression first, then with DataParallel on GPU.
compression_ctrl, compressed_model = create_compressed_model(model, nncf_config)
compressed_model_dp = nn.DataParallel(compressed_model).cuda()
optimizer = optim.SGD(params=compressed_model_dp.parameters(), lr=1e-3)

# NOTE(review): optimizer.zero_grad() is never called, so gradients
# accumulate across steps — acceptable for a crash repro, not real training.
for epoch in range(3):
    print(f'Start training epoch {epoch}')
    compressed_model_dp = compressed_model_dp.train()
    compression_ctrl.scheduler.epoch_step()
    for batch in train_dataloader:
        # Move each tensor of the dict batch to the default CUDA device;
        # DataParallel scatters them across replicas itself.
        x_cuda = {k: v.cuda() for k, v in batch.items()}
        compression_ctrl.scheduler.step()
        output = compressed_model_dp(**x_cuda)
        output.sum().backward()
        optimizer.step()
    # Export after every epoch. Per the traceback below, the crash
    # ("Expected all tensors to be on the same device") appears in the NEXT
    # epoch's forward pass — export seemingly leaves some quantizer state
    # pinned to one device (TODO: confirm against NNCF internals).
    print(f'Exporting to onnx after training epoch {epoch}')
    onnx_path = '/tmp/debug.onnx'
    if os.path.exists(onnx_path):
        os.remove(onnx_path)
    compression_ctrl.export_model(onnx_path)
    assert os.path.isfile(onnx_path)
INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx
WARNING:nncf:Enabling quantization range initialization with default parameters.
INFO:nncf:Collecting tensor statistics |█ | 3 / 32
INFO:nncf:Collecting tensor statistics |███ | 6 / 32
INFO:nncf:Collecting tensor statistics |████ | 9 / 32
INFO:nncf:Collecting tensor statistics |██████ | 12 / 32
INFO:nncf:Compiling and loading torch extension: quantized_functions_cpu...
INFO:nncf:Finished loading torch extension: quantized_functions_cpu
Start training epoch 0
WARNING:nncf:You are using DataParallel, which may cause significant performance issues with dynamic graph building. Consider using distributed training (DistributedDataParallel) instead.
INFO:nncf:Compiling and loading torch extension: quantized_functions_cuda...
INFO:nncf:Compiling and loading torch extension: quantized_functions_cuda...
INFO:nncf:Finished loading torch extension: quantized_functions_cuda
INFO:nncf:Finished loading torch extension: quantized_functions_cuda
/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/quantization/quantize_functions.py:141: FutureWarning: 'torch.onnx._patch_torch._graph_op' is deprecated in version 1.13 and will be removed in version 1.14. Please note 'g.op()' is to be removed from torch.Graph. Please open a GitHub issue if you need this functionality..
output = g.op(
/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/onnx/_patch_torch.py:81: UserWarning: The shape inference of org.openvinotoolkit::FakeQuantize type is missing, so it may result in wrong shape inference for the exported graph. Please consider adding it in symbolic function. (Triggered internally at ../torch/csrc/jit/passes/onnx/shape_type_inference.cpp:1884.)
_C._jit_pass_onnx_node_shape_type_inference(
/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/onnx/utils.py:687: UserWarning: The shape inference of org.openvinotoolkit::FakeQuantize type is missing, so it may result in wrong shape inference for the exported graph. Please consider adding it in symbolic function. (Triggered internally at ../torch/csrc/jit/passes/onnx/shape_type_inference.cpp:1884.)
_C._jit_pass_onnx_graph_shape_type_inference(
/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/onnx/utils.py:1178: UserWarning: The shape inference of org.openvinotoolkit::FakeQuantize type is missing, so it may result in wrong shape inference for the exported graph. Please consider adding it in symbolic function. (Triggered internally at ../torch/csrc/jit/passes/onnx/shape_type_inference.cpp:1884.)
_C._jit_pass_onnx_graph_shape_type_inference(
Exporting to onnx after training epoch 0
Start training epoch 1
Traceback (most recent call last):
File "debug_dp.py", line 69, in <module>
output = compressed_model_dp(**x_cuda)
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py", line 144, in wrapped
return module_call(self, *args, **kwargs)
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 171, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 181, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 89, in parallel_apply
output.reraise()
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/_utils.py", line 543, in reraise
raise exception
RuntimeError: Caught RuntimeError in replica 1 on device 1.
Original Traceback (most recent call last):
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
output = module(*input, **kwargs)
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py", line 144, in wrapped
return module_call(self, *args, **kwargs)
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/debug.py", line 68, in decorated
retval = forward_func(self, *args, **kwargs)
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/nncf_network.py", line 253, in forward
retval = self.get_nncf_wrapped_model()(*args, **kwargs)
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py", line 164, in wrapped
retval = module_call(self, *args, **kwargs)
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "debug_dp.py", line 22, in forward
return self.linear(x)
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py", line 164, in wrapped
retval = module_call(self, *args, **kwargs)
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/layer_utils.py", line 78, in forward
results = forward_fn(proxy_module, *args)
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/nn/modules/linear.py", line 114, in forward
return F.linear(input, self.weight, self.bias)
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py", line 108, in wrapped
result = _execute_op(op_address, operator_info, operator, ctx, *args, **kwargs)
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py", line 186, in _execute_op
result = operator(*args, **kwargs)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1! (when checking argument for argument mat1 in method wrapper_addmm)
attrs==22.2.0
autograd==1.5
autopep8==1.6.0
certifi @ file:///croot/certifi_1671487769961/work/certifi
charset-normalizer==3.0.1
cma==2.7.0
coloredlogs==15.0.1
contourpy==1.0.7
cycler==0.11.0
flatbuffers==23.1.21
fonttools==4.38.0
future==0.18.3
humanfriendly==10.0
idna==3.4
importlib-resources==5.10.2
joblib==1.2.0
jsonschema==4.17.3
jstyleson==0.0.2
kiwisolver==1.4.4
matplotlib==3.7.0
mpmath==1.2.1
natsort==8.2.0
networkx==2.8.2
ninja==1.10.2.4
nncf @ git+https://github.com/openvinotoolkit/nncf@06bd55537abd6bc804a38372155e08d926d731ca
numpy==1.23.5
onnx==1.12.0
onnxruntime==1.13.1
openvino-telemetry==2022.3.0
packaging==23.0
pandas==1.5.2
Pillow==9.4.0
pkgutil_resolve_name==1.3.10
protobuf==3.20.1
pydot==1.4.2
pymoo==0.5.0
pyparsing==2.4.7
pyrsistent==0.19.3
python-dateutil==2.8.2
pytz==2022.7.1
requests==2.28.2
scikit-learn==1.2.1
scipy==1.10.0
six==1.16.0
sympy==1.11.1
texttable==1.6.7
threadpoolctl==3.1.0
toml==0.10.2
torch==1.13.1+cu116
torchaudio==0.13.1+cu116
torchvision==0.14.1+cu116
tqdm==4.64.1
typing_extensions==4.5.0
urllib3==1.26.14
zipp==3.13.0
@yujiepan-work
Copy link
Author

yujiepan-work commented Feb 17, 2023

Environment: NNCF 06bd55537abd6bc804a38372155e08d926d731ca
Run:

CUDA_VISIBLE_DEVICES=0,1 python debug_dp.py

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment