Last active
February 17, 2023 07:04
-
-
Save yujiepan-work/964d4716902ee75bf132dc4d80c96e61 to your computer and use it in GitHub Desktop.
DP "not same device" crash after onnx export
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import torch | |
import torch.nn as nn | |
import torch.optim as optim | |
from torch.utils.data import DataLoader, Dataset | |
import nncf | |
from nncf.config import NNCFConfig | |
from nncf.config.structures import (BNAdaptationInitArgs, | |
QuantizationRangeInitArgs) | |
from nncf.torch import create_compressed_model | |
from nncf.torch.initialization import PTInitializingDataLoader | |
class MyModel(nn.Module):
    """Minimal toy model: a single fully-connected layer mapping 4 features to 1."""

    def __init__(self) -> None:
        super().__init__()
        # One affine layer, 4 inputs -> 1 output.
        self.linear = nn.Linear(4, 1)

    def forward(self, x):
        """Return the linear projection of *x*."""
        out = self.linear(x)
        return out
class MyDataset(Dataset):
    """Synthetic map-style dataset: 100 samples, each a random 4-dim vector."""

    def __len__(self):
        # Fixed synthetic dataset size.
        return 100

    def __getitem__(self, i):
        # Each access yields a fresh random tensor, keyed by the model's
        # forward-argument name 'x' (the index is ignored).
        sample = {'x': torch.rand(4)}
        return sample
class MyInitializingDataloader(PTInitializingDataLoader):
    """Adapter telling NNCF how to split one dataloader batch into model inputs."""

    def get_inputs(self, dataloader_output):
        # No positional arguments; the whole batch dict is passed as kwargs,
        # matching the model's forward(x=...) signature.
        args = ()
        kwargs = dataloader_output
        return args, kwargs
# --- Reproduction script: NNCF quantization + DataParallel + ONNX export. ---
# Trains a quantized model under nn.DataParallel and exports to ONNX after
# each epoch; the second epoch's forward then crashes with
# "Expected all tensors to be on the same device" (see attached traceback).

dataset = MyDataset()
train_dataloader = DataLoader(dataset, batch_size=8)

# Quantization config: one float input of shape [1, 4].
nncf_config = NNCFConfig.from_dict({
    "input_info": [
        {"sample_size": [1, 4],
         "type": "float"
        }
    ],
    "compression": [
        {
            "algorithm": "quantization"
        }
    ]
})
# Provide dataloaders for quantizer range init and BatchNorm adaptation.
nncf_config.register_extra_structs([
    QuantizationRangeInitArgs(MyInitializingDataloader(train_dataloader)),
    BNAdaptationInitArgs(MyInitializingDataloader(train_dataloader)),
])

model = MyModel()
compression_ctrl, compressed_model = create_compressed_model(model, nncf_config)
# Wrap the NNCF-compressed model in DataParallel and move it to GPU.
compressed_model_dp = nn.DataParallel(compressed_model).cuda()
optimizer = optim.SGD(params=compressed_model_dp.parameters(), lr=1e-3)

for epoch in range(3):
    print(f'Start training epoch {epoch}')
    compressed_model_dp = compressed_model_dp.train()
    compression_ctrl.scheduler.epoch_step()
    for batch in train_dataloader:
        # Move the batch dict to GPU; DataParallel scatters it across devices.
        x_cuda = {k: v.cuda() for k, v in batch.items()}
        compression_ctrl.scheduler.step()
        output = compressed_model_dp(**x_cuda)
        output.sum().backward()
        # NOTE(review): optimizer.zero_grad() is never called, so gradients
        # accumulate across steps — presumably irrelevant to this repro,
        # which targets the device crash, not training quality.
        optimizer.step()
    print(f'Exporting to onnx after training epoch {epoch}')
    onnx_path = '/tmp/debug.onnx'
    if os.path.exists(onnx_path):
        os.remove(onnx_path)
    # Exporting here is the trigger: per the traceback, the first forward of
    # the NEXT epoch fails in DP replica 1 with tensors on cuda:0 vs cuda:1,
    # so export_model appears to leave model state pinned to one device —
    # TODO confirm against NNCF internals.
    compression_ctrl.export_model(onnx_path)
    assert os.path.isfile(onnx_path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx | |
WARNING:nncf:Enabling quantization range initialization with default parameters. | |
INFO:nncf:Collecting tensor statistics |█ | 3 / 32 | |
INFO:nncf:Collecting tensor statistics |███ | 6 / 32 | |
INFO:nncf:Collecting tensor statistics |████ | 9 / 32 | |
INFO:nncf:Collecting tensor statistics |██████ | 12 / 32 | |
INFO:nncf:Compiling and loading torch extension: quantized_functions_cpu... | |
INFO:nncf:Finished loading torch extension: quantized_functions_cpu | |
Start training epoch 0 | |
WARNING:nncf:You are using DataParallel, which may cause significant performance issues with dynamic graph building. Consider using distributed training (DistributedDataParallel) instead. | |
INFO:nncf:Compiling and loading torch extension: quantized_functions_cuda... | |
INFO:nncf:Compiling and loading torch extension: quantized_functions_cuda... | |
INFO:nncf:Finished loading torch extension: quantized_functions_cuda | |
INFO:nncf:Finished loading torch extension: quantized_functions_cuda | |
/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/quantization/quantize_functions.py:141: FutureWarning: 'torch.onnx._patch_torch._graph_op' is deprecated in version 1.13 and will be removed in version 1.14. Please note 'g.op()' is to be removed from torch.Graph. Please open a GitHub issue if you need this functionality.. | |
output = g.op( | |
/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/onnx/_patch_torch.py:81: UserWarning: The shape inference of org.openvinotoolkit::FakeQuantize type is missing, so it may result in wrong shape inference for the exported graph. Please consider adding it in symbolic function. (Triggered internally at ../torch/csrc/jit/passes/onnx/shape_type_inference.cpp:1884.) | |
_C._jit_pass_onnx_node_shape_type_inference( | |
/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/onnx/utils.py:687: UserWarning: The shape inference of org.openvinotoolkit::FakeQuantize type is missing, so it may result in wrong shape inference for the exported graph. Please consider adding it in symbolic function. (Triggered internally at ../torch/csrc/jit/passes/onnx/shape_type_inference.cpp:1884.) | |
_C._jit_pass_onnx_graph_shape_type_inference( | |
/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/onnx/utils.py:1178: UserWarning: The shape inference of org.openvinotoolkit::FakeQuantize type is missing, so it may result in wrong shape inference for the exported graph. Please consider adding it in symbolic function. (Triggered internally at ../torch/csrc/jit/passes/onnx/shape_type_inference.cpp:1884.) | |
_C._jit_pass_onnx_graph_shape_type_inference( | |
Exporting to onnx after training epoch 0 | |
Start training epoch 1 | |
Traceback (most recent call last): | |
File "debug_dp.py", line 69, in <module> | |
output = compressed_model_dp(**x_cuda) | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py", line 144, in wrapped | |
return module_call(self, *args, **kwargs) | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl | |
return forward_call(*input, **kwargs) | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 171, in forward | |
outputs = self.parallel_apply(replicas, inputs, kwargs) | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 181, in parallel_apply | |
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 89, in parallel_apply | |
output.reraise() | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/_utils.py", line 543, in reraise | |
raise exception | |
RuntimeError: Caught RuntimeError in replica 1 on device 1. | |
Original Traceback (most recent call last): | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker | |
output = module(*input, **kwargs) | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py", line 144, in wrapped | |
return module_call(self, *args, **kwargs) | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl | |
return forward_call(*input, **kwargs) | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/debug.py", line 68, in decorated | |
retval = forward_func(self, *args, **kwargs) | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/nncf_network.py", line 253, in forward | |
retval = self.get_nncf_wrapped_model()(*args, **kwargs) | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py", line 164, in wrapped | |
retval = module_call(self, *args, **kwargs) | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl | |
return forward_call(*input, **kwargs) | |
File "debug_dp.py", line 22, in forward | |
return self.linear(x) | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py", line 164, in wrapped | |
retval = module_call(self, *args, **kwargs) | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl | |
return forward_call(*input, **kwargs) | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/layer_utils.py", line 78, in forward | |
results = forward_fn(proxy_module, *args) | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/torch/nn/modules/linear.py", line 114, in forward | |
return F.linear(input, self.weight, self.bias) | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py", line 108, in wrapped | |
result = _execute_op(op_address, operator_info, operator, ctx, *args, **kwargs) | |
File "/mnt/sh_flex_storage/home/yujiepan/tools/miniconda3/envs/nncf-dp-bugreport/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py", line 186, in _execute_op | |
result = operator(*args, **kwargs) | |
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1! (when checking argument for argument mat1 in method wrapper_addmm) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
attrs==22.2.0 | |
autograd==1.5 | |
autopep8==1.6.0 | |
certifi @ file:///croot/certifi_1671487769961/work/certifi | |
charset-normalizer==3.0.1 | |
cma==2.7.0 | |
coloredlogs==15.0.1 | |
contourpy==1.0.7 | |
cycler==0.11.0 | |
flatbuffers==23.1.21 | |
fonttools==4.38.0 | |
future==0.18.3 | |
humanfriendly==10.0 | |
idna==3.4 | |
importlib-resources==5.10.2 | |
joblib==1.2.0 | |
jsonschema==4.17.3 | |
jstyleson==0.0.2 | |
kiwisolver==1.4.4 | |
matplotlib==3.7.0 | |
mpmath==1.2.1 | |
natsort==8.2.0 | |
networkx==2.8.2 | |
ninja==1.10.2.4 | |
nncf @ git+https://github.com/openvinotoolkit/nncf@06bd55537abd6bc804a38372155e08d926d731ca | |
numpy==1.23.5 | |
onnx==1.12.0 | |
onnxruntime==1.13.1 | |
openvino-telemetry==2022.3.0 | |
packaging==23.0 | |
pandas==1.5.2 | |
Pillow==9.4.0 | |
pkgutil_resolve_name==1.3.10 | |
protobuf==3.20.1 | |
pydot==1.4.2 | |
pymoo==0.5.0 | |
pyparsing==2.4.7 | |
pyrsistent==0.19.3 | |
python-dateutil==2.8.2 | |
pytz==2022.7.1 | |
requests==2.28.2 | |
scikit-learn==1.2.1 | |
scipy==1.10.0 | |
six==1.16.0 | |
sympy==1.11.1 | |
texttable==1.6.7 | |
threadpoolctl==3.1.0 | |
toml==0.10.2 | |
torch==1.13.1+cu116 | |
torchaudio==0.13.1+cu116 | |
torchvision==0.14.1+cu116 | |
tqdm==4.64.1 | |
typing_extensions==4.5.0 | |
urllib3==1.26.14 | |
zipp==3.13.0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Environment: NNCF
06bd55537abd6bc804a38372155e08d926d731ca
Run:
CUDA_VISIBLE_DEVICES=0,1 python debug_dp.py