Last active
February 5, 2022 01:51
-
-
Save davidberard98/c6c14c786e5d0e90f6efa38ed81df4a7 to your computer and use it in GitHub Desktop.
nvFuser OpInfo tests — results Feb 4, 2022
This file has been truncated, but you can view the full file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
srun: job 21182 queued and waiting for resources | |
srun: job 21182 has been allocated resources | |
srun: error: ioctl(TIOCGWINSZ): Inappropriate ioctl for device | |
srun: error: Not using a pseudo-terminal, disregarding --pty option | |
monkeytype is not installed. Skipping tests for Profile-Directed Typing | |
test_nvfuser_correctness_H_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_H_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_H_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_H_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_H_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_H_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_H_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_H_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_H_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_H_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_H_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_H_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_T_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_T_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_T_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_T_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_T_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_T_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_T_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_T_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_T_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_T_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_T_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_T_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___getitem___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___getitem___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___getitem___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___getitem___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___getitem___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___getitem___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___getitem___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___getitem___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___getitem___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___getitem___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___getitem___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___getitem___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___radd___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:424: UserWarning: fast math disabled in nvfuser, try set `PYTORCH_NVFUSER_DISABLE_FASTMATH=0` (Triggered internally at ../torch/csrc/jit/codegen/cuda/executor_utils.cpp:705.) | |
return callable(*args, **kwargs) | |
ok | |
test_nvfuser_correctness___radd___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___radd___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___radd___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___radd___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___radd___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___radd___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___radd___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___radd___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___radd___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___radd___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___radd___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rand___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rand___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rand___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rand___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rand___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rand___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rdiv___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rdiv___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rdiv___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rdiv___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rdiv___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rdiv___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rdiv___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rdiv___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rdiv___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rdiv___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rdiv___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rdiv___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmatmul___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmatmul___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmatmul___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmatmul___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmatmul___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmatmul___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmod___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmod___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmod___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmod___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmod___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmod___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmod___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmod___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmod___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmod___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmul___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmul___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmul___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmul___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmul___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmul___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmul___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmul___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmul___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___rmul___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmul___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rmul___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___ror___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___ror___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___ror___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___ror___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___ror___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___ror___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rpow___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/_tensor.py:627: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. | |
return torch.tensor(other, dtype=dtype, device=self.device) ** self | |
ERROR | |
test_nvfuser_correctness___rpow___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___rpow___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___rpow___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___rpow___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___rpow___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___rpow___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___rpow___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___rpow___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___rpow___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___rpow___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___rpow___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness___rsub___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rsub___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rsub___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rsub___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rsub___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rsub___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rsub___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rsub___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rsub___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rsub___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rsub___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rxor___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rxor___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rxor___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rxor___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rxor___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness___rxor___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness__masked_amax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_amax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_amax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_amax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_amax_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/_masked/__init__.py:333: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. | |
return torch.tensor(torch.iinfo(dtype).min, dtype=dtype, device=device) | |
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amax_cuda_int16! Caching allocator allocated memory was 467456 and is now reported as 495616 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_amax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_amax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_amax_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amax_cuda_int8! Caching allocator allocated memory was 495616 and is now reported as 523776 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_amax_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amax_cuda_uint8! Caching allocator allocated memory was 523776 and is now reported as 551936 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_amin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/_masked/__init__.py:336: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. | |
return torch.tensor(torch.inf, dtype=dtype, device=device) | |
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amin_cuda_bfloat16! Caching allocator allocated memory was 551936 and is now reported as 580096 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_amin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amin_cuda_float16! Caching allocator allocated memory was 580096 and is now reported as 608256 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_amin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_amin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_amin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/_masked/__init__.py:338: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. | |
return torch.tensor(torch.iinfo(dtype).max, dtype=dtype, device=device) | |
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amin_cuda_int16! Caching allocator allocated memory was 608256 and is now reported as 636416 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_amin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_amin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_amin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_amin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_log_softmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/_masked/__init__.py:331: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. | |
return torch.tensor(-torch.inf, dtype=dtype, device=device) | |
/fsx/users/dberard/pytorch/torch/_masked/__init__.py:386: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! | |
elif mask.shape != input.shape: | |
ERROR | |
test_nvfuser_correctness__masked_log_softmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_log_softmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_log_softmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_mean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_mean_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_mean_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_mean_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_mean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_mean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_mean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_mean_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_mean_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_mean_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_mean_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_mean_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/_masked/__init__.py:351: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. | |
return torch.tensor(0, dtype=dtype, device=device) | |
/fsx/users/dberard/pytorch/torch/_masked/__init__.py:350: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. | |
return torch.tensor(torch.inf, dtype=dtype, device=device) | |
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_norm_cuda_bfloat16! Caching allocator allocated memory was 809472 and is now reported as 950272 on device 0. CUDA driver allocated memory was 1713373184 and is now 1713373184. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_norm_cuda_float16! Caching allocator allocated memory was 950272 and is now reported as 1091072 on device 0. CUDA driver allocated memory was 1713373184 and is now 1713373184. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_norm_cuda_float32! Caching allocator allocated memory was 1091072 and is now reported as 1231872 on device 0. CUDA driver allocated memory was 1713373184 and is now 1713373184. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_norm_cuda_float64! Caching allocator allocated memory was 1231872 and is now reported as 1372672 on device 0. CUDA driver allocated memory was 1713373184 and is now 1713373184. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_normalize_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness__masked_normalize_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness__masked_normalize_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness__masked_normalize_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness__masked_prod_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_bfloat16! Caching allocator allocated memory was 1372672 and is now reported as 1400832 on device 0. CUDA driver allocated memory was 1715470336 and is now 1715470336. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_prod_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_prod_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_complex128! Caching allocator allocated memory was 1401344 and is now reported as 1429504 on device 0. CUDA driver allocated memory was 1715470336 and is now 1715470336. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_prod_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_complex64! Caching allocator allocated memory was 1429504 and is now reported as 1457664 on device 0. CUDA driver allocated memory was 1715470336 and is now 1715470336. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_prod_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_float16! Caching allocator allocated memory was 1457664 and is now reported as 1485824 on device 0. CUDA driver allocated memory was 1715470336 and is now 1715470336. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_prod_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_prod_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_float64! Caching allocator allocated memory was 1513984 and is now reported as 1542144 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_prod_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_int16! Caching allocator allocated memory was 1542144 and is now reported as 1570304 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_prod_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_prod_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_int64! Caching allocator allocated memory was 1570816 and is now reported as 1598976 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_prod_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_int8! Caching allocator allocated memory was 1598976 and is now reported as 1627136 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_prod_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_uint8! Caching allocator allocated memory was 1627136 and is now reported as 1655296 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_softmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_softmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_softmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_softmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_softmin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_softmin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_softmin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_softmin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_sum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_sum_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_sum_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_sum_cuda_complex128! Caching allocator allocated memory was 1737728 and is now reported as 1765888 on device 0. CUDA driver allocated memory was 1721761792 and is now 1721761792. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_sum_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_sum_cuda_complex64! Caching allocator allocated memory was 1765888 and is now reported as 1794048 on device 0. CUDA driver allocated memory was 1721761792 and is now 1721761792. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_sum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_sum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_sum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_sum_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_sum_cuda_int16! Caching allocator allocated memory was 1794048 and is now reported as 1822208 on device 0. CUDA driver allocated memory was 1721761792 and is now 1721761792. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_sum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_sum_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_sum_cuda_int64! Caching allocator allocated memory was 1822720 and is now reported as 1850880 on device 0. CUDA driver allocated memory was 1721761792 and is now 1721761792. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_sum_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_sum_cuda_int8! Caching allocator allocated memory was 1850880 and is now reported as 1879040 on device 0. CUDA driver allocated memory was 1721761792 and is now 1721761792. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_sum_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_sum_cuda_uint8! Caching allocator allocated memory was 1879040 and is now reported as 1907200 on device 0. CUDA driver allocated memory was 1721761792 and is now 1721761792. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness__masked_var_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_var_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_var_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_var_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_var_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_var_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_var_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_var_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_var_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_var_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness__masked_var_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_abs_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_abs_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_abs_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_abs_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_abs_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_abs_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_abs_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_abs_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_abs_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_abs_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_abs_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_abs_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_acos_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_acos_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_acos_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_acos_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_acos_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_acos_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_acos_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_acos_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_acos_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_acos_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_acos_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_acos_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_acosh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_acosh_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_acosh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_acosh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_acosh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_acosh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_acosh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_acosh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_acosh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_acosh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_acosh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_acosh_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_add_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_add_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_add_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_add_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_add_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_add_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_add_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_add_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_add_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_add_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_add_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_add_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addbmm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addbmm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addbmm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addbmm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addbmm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addbmm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addcdiv_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addcdiv_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addcdiv_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addcdiv_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addcdiv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addcdiv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addcmul_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addcmul_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addcmul_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addcmul_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addcmul_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addcmul_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addcmul_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addcmul_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_addcmul_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addcmul_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addcmul_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addmm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addmm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addmm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addmm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addmm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addmm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addmm_decomposed_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addmm_decomposed_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addmm_decomposed_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addmm_decomposed_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addmm_decomposed_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addmm_decomposed_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addmv_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addmv_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addmv_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addmv_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addmv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addmv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addr_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addr_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addr_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addr_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addr_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addr_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addr_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addr_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addr_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_addr_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_all_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_all_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_all_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_all_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_all_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_all_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_all_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_all_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_all_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_all_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_all_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_all_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_allclose_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_allclose_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_allclose_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_allclose_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_allclose_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_allclose_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_amax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_amax_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_amax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_amax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_amax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_amax_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_amax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_amax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_amax_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_amax_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_amin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_amin_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_amin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_amin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_amin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_amin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_amin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_amin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_amin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_amin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_aminmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_aminmax_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_aminmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_aminmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_aminmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_aminmax_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_aminmax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_aminmax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_aminmax_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_aminmax_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_angle_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_angle_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_angle_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_angle_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_angle_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_angle_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_angle_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_angle_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_angle_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_angle_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_any_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_any_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_any_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_any_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_any_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_any_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_any_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_any_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_any_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_any_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_any_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_any_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argmax_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argmax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argmax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argmax_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argmax_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argmin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argmin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argmin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argmin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argmin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argmin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argmin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argmin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argmin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argsort_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argsort_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argsort_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argsort_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argsort_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argsort_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argsort_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argsort_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argsort_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argwhere_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argwhere_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argwhere_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argwhere_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argwhere_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argwhere_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argwhere_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argwhere_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argwhere_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argwhere_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argwhere_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_argwhere_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_as_strided_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_as_strided_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_as_strided_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_as_strided_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_as_strided_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_as_strided_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_as_strided_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_as_strided_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_as_strided_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_as_strided_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_as_strided_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_as_strided_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asin_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asin_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asin_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asinh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asinh_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asinh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asinh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asinh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asinh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asinh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asinh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asinh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asinh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asinh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_asinh_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan2_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan2_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atan_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atanh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atanh_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atanh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atanh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atanh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atanh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atanh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atanh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atanh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atanh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atanh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atanh_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_1d_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_1d_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_1d_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_1d_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_1d_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_1d_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_1d_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_1d_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_2d_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_2d_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_2d_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_2d_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_2d_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_2d_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_2d_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_2d_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_3d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_3d_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_3d_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_3d_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_3d_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_3d_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_3d_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_3d_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_atleast_3d_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_baddbmm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_baddbmm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_baddbmm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_baddbmm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_baddbmm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_baddbmm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bernoulli_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bernoulli_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bernoulli_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bernoulli_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py:3230: UserWarning: Casting complex values to real discards the imaginary part (Triggered internally at ../aten/src/ATen/native/Copy.cpp:239.) | |
ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
ok | |
test_nvfuser_correctness_bfloat16_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bfloat16_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bincount_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_bincount_cuda_int16! Caching allocator allocated memory was 1931776 and is now reported as 1932800 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_bincount_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_bincount_cuda_int32! Caching allocator allocated memory was 1932800 and is now reported as 1933824 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_bincount_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_bincount_cuda_int64! Caching allocator allocated memory was 1933824 and is now reported as 1934848 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_bincount_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_bincount_cuda_int8! Caching allocator allocated memory was 1934848 and is now reported as 1935872 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_bincount_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_bincount_cuda_uint8! Caching allocator allocated memory was 1935872 and is now reported as 1936896 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_bitwise_and_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_and_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_and_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_and_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_and_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_and_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_left_shift_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_left_shift_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_left_shift_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_left_shift_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_left_shift_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_left_shift_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_left_shift_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_left_shift_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_left_shift_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_not_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_bitwise_not_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_not_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_bitwise_not_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_bitwise_not_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_not_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_or_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_or_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_or_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_or_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_or_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_or_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_right_shift_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_right_shift_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_right_shift_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_right_shift_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_right_shift_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_right_shift_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_right_shift_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_right_shift_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_right_shift_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_xor_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_xor_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_xor_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_xor_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_xor_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bitwise_xor_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_block_diag_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_block_diag_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_block_diag_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_block_diag_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_block_diag_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_block_diag_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_block_diag_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_block_diag_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_block_diag_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_block_diag_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_block_diag_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_block_diag_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bmm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bmm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bmm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bmm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bmm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bmm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bool_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_bool_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_bool_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bool_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bool_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_bool_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_bool_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_bool_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bool_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_bool_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_bool_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bool_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bool_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_bool_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_bool_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bool_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bool_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_bool_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_bool_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_bool_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bool_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_bool_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_bool_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bool_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_tensors_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_tensors_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_tensors_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_tensors_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_tensors_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_tensors_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_tensors_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_tensors_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_tensors_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_tensors_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_tensors_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_tensors_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_to_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_to_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_to_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_to_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_to_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_to_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_to_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_to_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_to_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_to_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_to_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_broadcast_to_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bucketize_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bucketize_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bucketize_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bucketize_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bucketize_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bucketize_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bucketize_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_bucketize_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_byte_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cartesian_prod_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cartesian_prod_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cartesian_prod_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cartesian_prod_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cartesian_prod_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cartesian_prod_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cartesian_prod_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cartesian_prod_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cartesian_prod_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cartesian_prod_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cartesian_prod_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cartesian_prod_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cat_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cat_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cat_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cat_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cat_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cat_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cat_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cat_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cat_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cat_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cat_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cat_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cdist_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cdist_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ceil_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ceil_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ceil_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ceil_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_char_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cholesky_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_cholesky_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_cholesky_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_cholesky_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_cholesky_inverse_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_cholesky_inverse_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_cholesky_inverse_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_cholesky_inverse_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_cholesky_solve_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_cholesky_solve_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_cholesky_solve_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_cholesky_solve_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_chunk_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_chunk_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_chunk_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_chunk_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_chunk_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_chunk_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_chunk_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_chunk_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_chunk_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_chunk_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_chunk_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_chunk_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clamp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clamp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clamp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clamp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clamp_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clamp_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clamp_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clamp_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clamp_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clamp_scalar_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_clamp_scalar_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_clamp_scalar_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_clamp_scalar_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_clamp_scalar_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clamp_scalar_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_clamp_scalar_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_clamp_scalar_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clamp_scalar_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clone_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clone_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clone_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clone_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clone_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clone_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clone_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clone_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clone_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clone_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clone_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_clone_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_column_stack_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_column_stack_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_column_stack_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_column_stack_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_column_stack_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_column_stack_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_combinations_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py:3230: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:2246.) | |
ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
ok | |
test_nvfuser_correctness_combinations_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_combinations_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_combinations_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_combinations_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_combinations_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_combinations_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_combinations_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_combinations_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_combinations_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_combinations_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_combinations_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_complex_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_complex_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_physical_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_physical_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_physical_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_physical_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_physical_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_physical_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_physical_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_physical_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_physical_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_physical_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_physical_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_conj_physical_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_contiguous_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_contiguous_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_contiguous_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_contiguous_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_contiguous_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_contiguous_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_contiguous_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_contiguous_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_contiguous_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_contiguous_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_contiguous_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_contiguous_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_copysign_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_copysign_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_copysign_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_copysign_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_copysign_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_copysign_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_copysign_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_copysign_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_copysign_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_copysign_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_corrcoef_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_corrcoef_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_corrcoef_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_corrcoef_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_corrcoef_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_corrcoef_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_corrcoef_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_corrcoef_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_corrcoef_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_corrcoef_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_corrcoef_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cos_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_cos_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_cos_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cos_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cos_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_cos_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_cos_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_cos_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cos_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_cos_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_cos_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cos_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cosh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_cosh_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_cosh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cosh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cosh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_cosh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_cosh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_cosh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cosh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_cosh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_cosh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cosh_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_count_nonzero_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_count_nonzero_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_count_nonzero_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_count_nonzero_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_count_nonzero_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_count_nonzero_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_count_nonzero_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_count_nonzero_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_count_nonzero_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_count_nonzero_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_count_nonzero_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_count_nonzero_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cov_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_bfloat16! Caching allocator allocated memory was 1936896 and is now reported as 1940992 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_cov_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_complex128! Caching allocator allocated memory was 1940992 and is now reported as 1945088 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_cov_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_complex64! Caching allocator allocated memory was 1945088 and is now reported as 1949184 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_cov_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_float16! Caching allocator allocated memory was 1949184 and is now reported as 1953280 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_cov_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_float32! Caching allocator allocated memory was 1953280 and is now reported as 1957376 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_cov_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_float64! Caching allocator allocated memory was 1957376 and is now reported as 1961472 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_cov_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_int16! Caching allocator allocated memory was 1961472 and is now reported as 1965568 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_cov_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_int32! Caching allocator allocated memory was 1965568 and is now reported as 1969664 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_cov_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_int64! Caching allocator allocated memory was 1969664 and is now reported as 1973760 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_cov_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_int8! Caching allocator allocated memory was 1973760 and is now reported as 1977856 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_cov_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_uint8! Caching allocator allocated memory was 1977856 and is now reported as 1981952 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_cross_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cross_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cross_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cross_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cross_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cross_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cross_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cross_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cross_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cross_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummax_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummax_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummax_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummax_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummin_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cummin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumprod_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumprod_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumprod_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumprod_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumprod_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumprod_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumprod_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumprod_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumprod_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumprod_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumprod_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumsum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumsum_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumsum_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumsum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumsum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumsum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumsum_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumsum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumsum_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumsum_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumsum_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumulative_trapezoid_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumulative_trapezoid_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumulative_trapezoid_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumulative_trapezoid_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumulative_trapezoid_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumulative_trapezoid_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumulative_trapezoid_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumulative_trapezoid_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumulative_trapezoid_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumulative_trapezoid_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_cumulative_trapezoid_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_deg2rad_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_deg2rad_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_deg2rad_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_deg2rad_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_deg2rad_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_deg2rad_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_deg2rad_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_deg2rad_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_deg2rad_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_deg2rad_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_embed_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_embed_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_embed_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_embed_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_embed_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_embed_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_embed_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_embed_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_embed_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_embed_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_embed_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diag_embed_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagflat_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagflat_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagflat_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagflat_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagflat_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagflat_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagflat_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagflat_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagflat_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagflat_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagflat_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagflat_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_scatter_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_scatter_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_scatter_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_scatter_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_scatter_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_scatter_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_scatter_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_scatter_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_scatter_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diagonal_scatter_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diff_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diff_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diff_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diff_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diff_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diff_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diff_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diff_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diff_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diff_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diff_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_diff_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_digamma_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py:3230: UserWarning: Specified kernel cache directory could not be created! This disables kernel caching. Specified directory is /data/home/dberard/.cache/torch/kernels. This warning will appear only once per process. (Triggered internally at ../aten/src/ATen/native/cuda/jit_utils.cpp:844.) | |
ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
ok | |
test_nvfuser_correctness_digamma_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_digamma_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_digamma_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_digamma_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_digamma_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_digamma_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_digamma_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_digamma_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dist_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dist_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dist_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dist_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dist_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dist_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_div_floor_rounding_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_floor_rounding_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_floor_rounding_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_div_floor_rounding_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_div_floor_rounding_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_floor_rounding_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_floor_rounding_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_floor_rounding_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_div_floor_rounding_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_floor_rounding_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_floor_rounding_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_div_floor_rounding_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_div_no_rounding_mode_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_no_rounding_mode_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_no_rounding_mode_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_div_no_rounding_mode_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_div_no_rounding_mode_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_no_rounding_mode_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_no_rounding_mode_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_no_rounding_mode_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_div_no_rounding_mode_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_no_rounding_mode_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_no_rounding_mode_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_div_no_rounding_mode_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_div_trunc_rounding_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_trunc_rounding_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_trunc_rounding_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_div_trunc_rounding_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_div_trunc_rounding_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_trunc_rounding_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_trunc_rounding_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_trunc_rounding_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_div_trunc_rounding_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_trunc_rounding_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_div_trunc_rounding_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_div_trunc_rounding_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dot_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dot_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dot_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dot_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dot_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dot_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_double_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_double_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_double_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dsplit_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dsplit_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dsplit_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dsplit_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dsplit_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dsplit_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dsplit_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dsplit_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dsplit_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dsplit_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dsplit_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dsplit_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dstack_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dstack_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dstack_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dstack_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dstack_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dstack_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dstack_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dstack_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dstack_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dstack_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dstack_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_dstack_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_eig_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_eig_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_eig_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_eig_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_einsum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_einsum_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_einsum_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_einsum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_einsum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_einsum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_empty_like_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_empty_like_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_empty_like_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_empty_like_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_empty_like_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_empty_like_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_empty_like_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_empty_like_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_empty_like_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_empty_like_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_empty_like_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_empty_like_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_eq_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_eq_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_eq_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_eq_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_eq_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_eq_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_eq_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_eq_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_eq_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_eq_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_eq_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_eq_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erf_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erf_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erf_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erf_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erf_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erf_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erf_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erf_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erf_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erf_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erfc_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_erfc_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_erfc_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_erfc_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_erfc_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_erfc_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erfc_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_erfc_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_erfc_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erfc_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erfinv_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erfinv_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erfinv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erfinv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erfinv_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erfinv_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erfinv_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erfinv_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_erfinv_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_exp2_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_exp2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_exp2_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_exp2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_exp2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_exp2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_exp2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_exp2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_exp2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_exp2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_exp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_exp_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_exp_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_exp_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_exp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_exp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_exp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_exp_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_exp_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_exp_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_exp_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_exp_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_as_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_as_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_as_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_as_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_as_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_as_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_as_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_as_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_as_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_as_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_as_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_as_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expand_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expm1_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expm1_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expm1_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expm1_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expm1_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expm1_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expm1_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expm1_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expm1_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_expm1_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft2_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft2_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fft_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftn_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftn_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftshift_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftshift_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftshift_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftshift_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftshift_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftshift_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftshift_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftshift_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftshift_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftshift_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftshift_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_fftshift_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft2_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft2_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfft_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfftn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfftn_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfftn_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfftn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfftn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfftn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfftn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfftn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfftn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_hfftn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft2_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft2_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifft_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftn_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftn_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftshift_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftshift_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftshift_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftshift_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftshift_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftshift_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftshift_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftshift_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftshift_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftshift_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftshift_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ifftshift_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfft2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfft2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfft2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfft2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfft2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfft2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfft2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfft2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfft_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfft_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfft_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfft_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfft_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfft_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfftn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfftn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfftn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfftn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfftn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfftn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfftn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_ihfftn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft2_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft2_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfft_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfftn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfftn_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfftn_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfftn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfftn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfftn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfftn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfftn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfftn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_irfftn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfft2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfft2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfft2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfft2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfft2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfft2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfft2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfft2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfft_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfft_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfft_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfft_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfft_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfft_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfftn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfftn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfftn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfftn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfftn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfftn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfftn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fft_rfftn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fill__cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fill__cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fill__cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fill__cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fill__cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fill__cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fill__cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fill__cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fill__cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fill__cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fill__cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fill__cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flatten_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flatten_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flatten_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flatten_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flatten_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flatten_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flatten_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flatten_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flatten_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flatten_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flatten_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flatten_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flip_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flip_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flip_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flip_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flip_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flip_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flip_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flip_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flip_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flip_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flip_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flip_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fliplr_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fliplr_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fliplr_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fliplr_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fliplr_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fliplr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fliplr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fliplr_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fliplr_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fliplr_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fliplr_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fliplr_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flipud_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flipud_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flipud_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flipud_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flipud_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flipud_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flipud_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flipud_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flipud_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flipud_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flipud_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_flipud_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_float_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_float_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_power_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_power_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_power_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_power_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_power_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_power_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_power_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_power_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_power_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_power_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_power_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_float_power_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_floor_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_floor_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_floor_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_floor_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_floor_divide_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py:3230: UserWarning: floor_divide is deprecated, and will be removed in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. | |
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at ../aten/src/ATen/native/BinaryOps.cpp:607.) | |
ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
ok | |
test_nvfuser_correctness_floor_divide_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_floor_divide_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_floor_divide_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_floor_divide_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_floor_divide_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_floor_divide_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_floor_divide_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_floor_divide_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmax_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmax_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmax_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmax_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmin_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmod_autodiffed_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmod_autodiffed_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmod_autodiffed_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmod_autodiffed_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmod_autodiffed_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmod_autodiffed_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmod_autodiffed_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmod_autodiffed_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmod_autodiffed_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmod_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmod_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmod_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmod_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmod_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmod_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmod_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_fmod_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_frac_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_frac_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_frac_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_frac_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_frexp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_frexp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_frexp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_full_like_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_full_like_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_full_like_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_full_like_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_full_like_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_full_like_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_full_like_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_full_like_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_full_like_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_full_like_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_full_like_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_full_like_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gather_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gather_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gather_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gather_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gather_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gather_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gather_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gather_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gather_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gather_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gather_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gather_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gcd_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gcd_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gcd_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gcd_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gcd_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ge_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_ge_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_ge_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_ge_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_ge_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_ge_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ge_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_ge_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_ge_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ge_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_geqrf_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_geqrf_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_geqrf_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_geqrf_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_gradient_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_gradient_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_gradient_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_gradient_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_gradient_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_gradient_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_gradient_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_gradient_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_gradient_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_gradient_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_gt_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_gt_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_gt_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_gt_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_gt_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_gt_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gt_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_gt_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_gt_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_gt_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_half_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_heaviside_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_heaviside_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_heaviside_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_heaviside_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_heaviside_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_heaviside_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_heaviside_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_heaviside_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_heaviside_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_heaviside_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_histc_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_histc_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_histc_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_histc_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_histc_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_histc_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hsplit_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hsplit_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hsplit_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hsplit_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hsplit_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hsplit_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hsplit_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hsplit_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hsplit_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hsplit_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hsplit_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hsplit_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hstack_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hstack_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hstack_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hstack_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hstack_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hstack_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hstack_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hstack_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hstack_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hstack_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hstack_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hstack_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hypot_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hypot_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hypot_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_hypot_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_i0_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_i0_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_i0_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_i0_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_i0_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_i0_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_i0_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_i0_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_i0_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_i0_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_igamma_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_igamma_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_igamma_grad_other_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_igamma_grad_other_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_igammac_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_igammac_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_igammac_grad_other_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_igammac_grad_other_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_imag_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_imag_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_add_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_add_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_add_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_add_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_add_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_add_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_add_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_add_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_add_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_add_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_add_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_add_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_copy_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_copy_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_copy_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_copy_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_copy_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_copy_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_copy_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_copy_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_copy_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_copy_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_copy_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_copy_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_fill_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_fill_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_fill_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_fill_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_fill_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_fill_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_fill_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_fill_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_fill_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_fill_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_fill_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_fill_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_put_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_put_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_put_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_put_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_put_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_put_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_put_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_put_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_put_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_put_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_put_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_put_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_select_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_select_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_select_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_select_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_select_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_select_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_select_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_select_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_select_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_select_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_select_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_index_select_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_inner_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_inner_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_inner_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_inner_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_inner_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_inner_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_int_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_int_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_int_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_int_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_int_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_inverse_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_inverse_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_inverse_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_inverse_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isclose_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isclose_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isclose_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isclose_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isclose_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isclose_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isclose_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isclose_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isclose_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isclose_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isclose_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isclose_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isfinite_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isfinite_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isfinite_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isfinite_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isfinite_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isfinite_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isfinite_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isfinite_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isfinite_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isfinite_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isfinite_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isfinite_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isinf_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isinf_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isinf_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isinf_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isinf_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isinf_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isinf_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isinf_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isinf_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isinf_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isinf_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isinf_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isnan_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isnan_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isnan_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isnan_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isnan_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isnan_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isnan_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isnan_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isnan_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isnan_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isnan_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isnan_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isneginf_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isneginf_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isneginf_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isneginf_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isneginf_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isneginf_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isneginf_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isneginf_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isneginf_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isneginf_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isposinf_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isposinf_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isposinf_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isposinf_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isposinf_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isposinf_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isposinf_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isposinf_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isposinf_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isposinf_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isreal_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isreal_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isreal_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isreal_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isreal_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isreal_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isreal_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isreal_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isreal_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isreal_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isreal_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_isreal_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_istft_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_istft_cuda_complex128! Caching allocator allocated memory was 1981952 and is now reported as 1982464 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_istft_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_istft_cuda_complex64! Caching allocator allocated memory was 1982464 and is now reported as 1982976 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_istft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/functional.py:770: UserWarning: istft will require a complex-valued input tensor in a future PyTorch release. Matching the output from stft with return_complex=True. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:950.) | |
return _VF.istft(input, n_fft, hop_length, win_length, window, center, # type: ignore[attr-defined] | |
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_istft_cuda_float32! Caching allocator allocated memory was 1982976 and is now reported as 1983488 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_istft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_istft_cuda_float64! Caching allocator allocated memory was 1983488 and is now reported as 1984000 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_kron_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kron_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kron_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kron_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kron_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kron_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kron_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kron_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kron_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kron_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kron_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kron_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kthvalue_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kthvalue_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kthvalue_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kthvalue_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kthvalue_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kthvalue_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kthvalue_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_kthvalue_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lcm_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lcm_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lcm_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lcm_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lcm_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ldexp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ldexp_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ldexp_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ldexp_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ldexp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ldexp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ldexp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ldexp_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ldexp_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ldexp_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ldexp_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ldexp_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_le_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_le_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_le_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_le_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_le_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_le_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_le_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_le_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_le_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_le_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lerp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lerp_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lerp_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lerp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lerp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lerp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lgamma_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_lgamma_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_lgamma_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_lgamma_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_lgamma_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lgamma_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_lgamma_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_lgamma_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lgamma_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cholesky_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cholesky_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cholesky_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cholesky_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cholesky_ex_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cholesky_ex_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cholesky_ex_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cholesky_ex_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cond_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cond_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cond_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cond_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cross_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cross_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cross_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cross_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cross_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cross_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cross_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cross_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cross_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_cross_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_det_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_det_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_det_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_det_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_det_singular_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_det_singular_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_eig_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_eig_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_eig_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_eig_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_eigh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_eigh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_eigh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_eigh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_eigvals_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_eigvals_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_eigvals_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_eigvals_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_eigvalsh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_eigvalsh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_eigvalsh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_eigvalsh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_householder_product_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_householder_product_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_householder_product_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_householder_product_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_inv_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_inv_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_inv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_inv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_inv_ex_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_inv_ex_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_inv_ex_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_inv_ex_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_lstsq_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_lstsq_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_lstsq_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_lstsq_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_lstsq_grad_oriented_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_lstsq_grad_oriented_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_lstsq_grad_oriented_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_lstsq_grad_oriented_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_lu_factor_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_lu_factor_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_lu_factor_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_lu_factor_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_lu_factor_ex_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_lu_factor_ex_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_lu_factor_ex_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_lu_factor_ex_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_matrix_norm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_matrix_norm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_matrix_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_matrix_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_matrix_power_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_matrix_power_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_matrix_power_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_matrix_power_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_matrix_rank_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_matrix_rank_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_matrix_rank_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_matrix_rank_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_matrix_rank_hermitian_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_matrix_rank_hermitian_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_matrix_rank_hermitian_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_matrix_rank_hermitian_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_multi_dot_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_multi_dot_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_multi_dot_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_multi_dot_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_multi_dot_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_multi_dot_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_norm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_norm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_pinv_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_linalg_pinv_cuda_complex128! Caching allocator allocated memory was 1984000 and is now reported as 1988096 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_linalg_pinv_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_linalg_pinv_cuda_complex64! Caching allocator allocated memory was 1988096 and is now reported as 1992192 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_linalg_pinv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_linalg_pinv_cuda_float32! Caching allocator allocated memory was 1992192 and is now reported as 1996288 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_linalg_pinv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_linalg_pinv_cuda_float64! Caching allocator allocated memory was 1996288 and is now reported as 2000384 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_linalg_pinv_hermitian_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_pinv_hermitian_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_pinv_hermitian_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_pinv_hermitian_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_pinv_singular_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test' | |
test_nvfuser_correctness_linalg_pinv_singular_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test' | |
test_nvfuser_correctness_linalg_pinv_singular_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test' | |
test_nvfuser_correctness_linalg_pinv_singular_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test' | |
test_nvfuser_correctness_linalg_qr_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_qr_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_qr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_qr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_slogdet_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_slogdet_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_slogdet_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_slogdet_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_solve_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_solve_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_solve_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_solve_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_solve_triangular_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_solve_triangular_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_solve_triangular_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_solve_triangular_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_svd_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_svd_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_svd_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_svd_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_svdvals_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_svdvals_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_svdvals_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_svdvals_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_tensorinv_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_tensorinv_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_tensorinv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_tensorinv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_linalg_tensorsolve_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_linalg_tensorsolve_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_linalg_tensorsolve_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_linalg_tensorsolve_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_linalg_vector_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_vector_norm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_vector_norm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_vector_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_vector_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_linalg_vector_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_log10_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log10_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log10_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log10_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log10_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log10_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log10_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log10_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log10_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log10_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log10_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log10_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log1p_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log1p_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log1p_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log1p_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log1p_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log1p_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log1p_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log1p_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log1p_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log1p_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log2_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log2_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log2_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log2_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_log_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_softmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_softmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_softmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_softmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_softmax_dtype_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_softmax_dtype_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_softmax_dtype_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_softmax_dtype_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_softmax_dtype_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_softmax_dtype_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_softmax_dtype_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_softmax_dtype_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_softmax_dtype_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_softmax_dtype_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_softmax_dtype_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_log_softmax_dtype_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logaddexp2_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logaddexp2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logaddexp2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logaddexp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logaddexp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logaddexp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logcumsumexp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logcumsumexp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logcumsumexp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logcumsumexp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logdet_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_logdet_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_logical_and_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_and_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_and_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_and_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_and_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_and_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_and_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_and_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_and_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_and_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_and_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_and_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_not_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_not_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_not_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_not_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_not_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_not_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_not_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_not_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_not_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_not_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_not_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_not_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_or_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_or_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_or_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_or_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_or_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_or_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_or_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_or_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_or_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_or_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_or_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_or_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_xor_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_xor_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_xor_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_xor_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_xor_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_xor_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_xor_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_xor_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_xor_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_xor_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_xor_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logical_xor_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logit_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logit_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logit_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logit_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logit_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logit_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logit_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logit_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logit_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logit_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logsumexp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logsumexp_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logsumexp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logsumexp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logsumexp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logsumexp_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logsumexp_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logsumexp_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logsumexp_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_logsumexp_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_long_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_long_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_long_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lt_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_lt_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_lt_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_lt_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_lt_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_lt_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lt_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_lt_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_lt_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lt_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lu_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lu_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lu_solve_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_lu_solve_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_lu_solve_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_lu_solve_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_lu_unpack_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lu_unpack_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lu_unpack_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_lu_unpack_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mH_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mH_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mH_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mH_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mH_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mH_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mH_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mH_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mH_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mH_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mH_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mH_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mT_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mT_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mT_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mT_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mT_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mT_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mT_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mT_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mT_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mT_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mT_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mT_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_fill_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_fill_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_fill_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_fill_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_fill_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_fill_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_fill_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_fill_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_fill_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_fill_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_fill_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_fill_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_scatter_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_scatter_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_scatter_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_scatter_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_scatter_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_scatter_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_scatter_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_scatter_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_scatter_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_scatter_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_scatter_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_scatter_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_select_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_select_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_select_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_select_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_select_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_select_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_select_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_select_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_select_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_select_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_select_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_masked_select_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_matmul_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_matmul_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_matmul_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_matmul_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_matmul_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_matmul_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_matrix_exp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_matrix_exp_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_matrix_exp_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_matrix_exp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_matrix_exp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_matrix_exp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_binary_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_binary_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_max_binary_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_binary_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_binary_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_binary_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_binary_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_binary_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_binary_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_binary_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_no_dim_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_no_dim_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_no_dim_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_no_dim_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_no_dim_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_no_dim_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_no_dim_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_no_dim_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_no_dim_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_no_dim_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_with_dim_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_with_dim_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_with_dim_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_with_dim_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_with_dim_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_with_dim_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_with_dim_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_with_dim_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_with_dim_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_max_reduction_with_dim_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_maximum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_maximum_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_maximum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_maximum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_maximum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_maximum_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_maximum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_maximum_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_maximum_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_maximum_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_mean_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mean_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_mean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_mean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_median_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_median_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_median_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_median_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_median_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_median_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_median_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_median_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_binary_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_binary_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_min_binary_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_binary_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_binary_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_binary_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_binary_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_binary_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_binary_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_binary_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_no_dim_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_no_dim_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_no_dim_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_no_dim_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_no_dim_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_no_dim_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_no_dim_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_no_dim_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_no_dim_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_no_dim_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_with_dim_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_with_dim_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_with_dim_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_with_dim_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_with_dim_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_with_dim_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_with_dim_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_with_dim_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_with_dim_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_min_reduction_with_dim_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_minimum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_minimum_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_minimum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_minimum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_minimum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_minimum_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_minimum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_minimum_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_minimum_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_minimum_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mode_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mode_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mode_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mode_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mode_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mode_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mode_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mode_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mode_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mode_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_movedim_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_movedim_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_movedim_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_movedim_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_movedim_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_movedim_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_movedim_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_movedim_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_movedim_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_movedim_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_movedim_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_movedim_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_msort_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_msort_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_msort_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_msort_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_msort_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_msort_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_msort_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_msort_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_msort_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mul_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_mul_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_mul_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mul_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mul_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_mul_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_mul_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_mul_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mul_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_mul_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_mul_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mul_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_multinomial_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_multinomial_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_multinomial_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mv_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mv_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mv_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mv_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nan_to_num_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nan_to_num_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nan_to_num_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nan_to_num_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nan_to_num_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nan_to_num_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nan_to_num_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nan_to_num_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nan_to_num_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nan_to_num_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nanmean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nanmean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nanmean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nanmean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nanmedian_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nanmedian_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nanmedian_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nanmedian_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nanmedian_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nanmedian_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nanmedian_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nanmedian_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nanquantile_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nanquantile_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nansum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nansum_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nansum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nansum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nansum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nansum_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nansum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nansum_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nansum_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nansum_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_narrow_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_narrow_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_narrow_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_narrow_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_narrow_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_narrow_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_narrow_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_narrow_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_narrow_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_narrow_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_narrow_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_narrow_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ne_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_ne_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_ne_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ne_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ne_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_ne_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_ne_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_ne_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ne_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_ne_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_ne_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ne_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_neg_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_neg_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_neg_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_neg_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_neg_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_neg_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_neg_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_neg_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_neg_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_neg_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_neg_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_empty_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_new_empty_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_new_empty_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_new_empty_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_new_empty_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_new_empty_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_new_empty_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_new_empty_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_new_empty_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_new_empty_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_new_empty_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_new_empty_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!' | |
test_nvfuser_correctness_new_full_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_full_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_full_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_full_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_full_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_full_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_full_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_full_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_full_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_full_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_full_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_full_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_ones_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_ones_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_ones_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_ones_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_ones_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_ones_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_ones_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_ones_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_ones_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_ones_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_ones_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_ones_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_zeros_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_zeros_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_zeros_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_zeros_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_zeros_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_zeros_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_zeros_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_zeros_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_zeros_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_zeros_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_zeros_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_new_zeros_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nextafter_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nextafter_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nextafter_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_avg_pool1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_avg_pool1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_avg_pool1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_avg_pool1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_avg_pool2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_avg_pool2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_avg_pool2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_avg_pool2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_avg_pool3d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_avg_pool3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_avg_pool3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_avg_pool3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_max_pool1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_max_pool1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_max_pool1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_max_pool1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_max_pool2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_max_pool2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_max_pool2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_max_pool2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_max_pool3d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_max_pool3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_max_pool3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_adaptive_max_pool3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_avg_pool1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_avg_pool1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_avg_pool1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_avg_pool1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_avg_pool2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_avg_pool2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_avg_pool2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_avg_pool2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_avg_pool3d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_avg_pool3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_avg_pool3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_avg_pool3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_batch_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:2363: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! | |
if size_prods == 1: | |
ERROR | |
test_nvfuser_correctness_nn_functional_batch_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_batch_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_batch_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_batch_norm_without_cudnn_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_batch_norm_without_cudnn_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_batch_norm_without_cudnn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_batch_norm_without_cudnn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_bilinear_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_bilinear_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_bilinear_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_bilinear_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_celu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_celu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_celu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_celu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_conv1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_conv1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_conv1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_conv1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_conv2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py:3230: UserWarning: Using padding='same' with even kernel lengths and odd dilation may require a zero-padded copy of the input be created (Triggered internally at ../aten/src/ATen/native/Convolution.cpp:744.) | |
ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
ok | |
test_nvfuser_correctness_nn_functional_conv2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_conv2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_conv2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_conv_transpose1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_conv_transpose1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_conv_transpose1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_conv_transpose1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_conv_transpose2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_conv_transpose2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_conv_transpose2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_conv_transpose2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_conv_transpose3d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_conv_transpose3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... FAIL | |
test_nvfuser_correctness_nn_functional_conv_transpose3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_conv_transpose3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_cosine_similarity_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_cosine_similarity_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_cosine_similarity_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_cosine_similarity_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_cross_entropy_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_cross_entropy_cuda_bfloat16! Caching allocator allocated memory was 2000384 and is now reported as 2001920 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_nn_functional_cross_entropy_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_cross_entropy_cuda_float16! Caching allocator allocated memory was 2001920 and is now reported as 2003456 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_nn_functional_cross_entropy_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_cross_entropy_cuda_float32! Caching allocator allocated memory was 2003456 and is now reported as 2004992 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_nn_functional_cross_entropy_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_cross_entropy_cuda_float64! Caching allocator allocated memory was 2004992 and is now reported as 2006528 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_nn_functional_ctc_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_ctc_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_dropout2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_dropout2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_dropout2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_dropout2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_dropout_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_dropout_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_dropout_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_dropout_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_elu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_elu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_elu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_elu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_embedding_bag_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_embedding_bag_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_embedding_bag_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_embedding_bag_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_embedding_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_embedding_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_embedding_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_embedding_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_with_train_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_with_train_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_with_train_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_with_train_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_fractional_max_pool2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_fractional_max_pool2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_fractional_max_pool2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_fractional_max_pool3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_fractional_max_pool3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_fractional_max_pool3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:2756: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! | |
if var.size() != input.size(): | |
/fsx/users/dberard/pytorch/torch/nn/functional.py:2780: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! | |
if torch.any(var < 0): | |
ERROR | |
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_gelu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_gelu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_gelu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_gelu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_glu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_glu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_glu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_glu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_grid_sample_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_grid_sample_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_grid_sample_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_group_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:2475: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). | |
_verify_batch_size([input.size(0) * input.size(1) // num_groups, num_groups] + list(input.size()[2:])) | |
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_group_norm_cuda_bfloat16! Caching allocator allocated memory was 2006528 and is now reported as 2010624 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_nn_functional_group_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_group_norm_cuda_float16! Caching allocator allocated memory was 2010624 and is now reported as 2014720 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_nn_functional_group_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_group_norm_cuda_float32! Caching allocator allocated memory was 2014720 and is now reported as 2018816 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_nn_functional_group_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_group_norm_cuda_float64! Caching allocator allocated memory was 2018816 and is now reported as 2022912 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_nn_functional_hardshrink_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardshrink_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardshrink_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardshrink_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardsigmoid_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardsigmoid_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardsigmoid_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardsigmoid_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardswish_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardswish_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardswish_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardswish_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardtanh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardtanh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardtanh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardtanh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardtanh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardtanh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardtanh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hardtanh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hinge_embedding_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hinge_embedding_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hinge_embedding_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_hinge_embedding_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_huber_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3170: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! | |
if not (target.size() == input.size()): | |
ok | |
test_nvfuser_correctness_nn_functional_huber_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_huber_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_huber_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_instance_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:2408: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! | |
if size_prods == 1: | |
ERROR | |
test_nvfuser_correctness_nn_functional_instance_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_instance_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_instance_norm_cuda_float32! Caching allocator allocated memory was 2027008 and is now reported as 2041856 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_nn_functional_instance_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_instance_norm_cuda_float64! Caching allocator allocated memory was 2041856 and is now reported as 2056704 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_nn_functional_interpolate_area_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3826: UserWarning: The default behavior for interpolate/upsample with float scale_factor changed in 1.6.0 to align with other frameworks/libraries, and now uses scale_factor directly, instead of relying on the computed output size. If you wish to restore the old behavior, please set recompute_scale_factor=True. See the documentation of nn.Upsample for details. | |
warnings.warn( | |
/fsx/users/dberard/pytorch/torch/nn/functional.py:3848: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. | |
(torch.floor((input.size(i + 2).float() * torch.tensor(scale_factors[i], dtype=torch.float32)).float())) | |
ok | |
test_nvfuser_correctness_nn_functional_interpolate_area_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_interpolate_area_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_interpolate_area_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_interpolate_bicubic_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3771: UserWarning: Default upsampling behavior when mode=bicubic is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details. | |
warnings.warn( | |
ok | |
test_nvfuser_correctness_nn_functional_interpolate_bicubic_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_interpolate_bicubic_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_interpolate_bilinear_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3771: UserWarning: Default upsampling behavior when mode=bilinear is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details. | |
warnings.warn( | |
ok | |
test_nvfuser_correctness_nn_functional_interpolate_bilinear_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_interpolate_bilinear_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_interpolate_linear_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3771: UserWarning: Default upsampling behavior when mode=linear is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details. | |
warnings.warn( | |
ok | |
test_nvfuser_correctness_nn_functional_interpolate_linear_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_interpolate_linear_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_interpolate_nearest_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_interpolate_nearest_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_interpolate_nearest_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_interpolate_nearest_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_interpolate_trilinear_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3771: UserWarning: Default upsampling behavior when mode=trilinear is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details. | |
warnings.warn( | |
ok | |
test_nvfuser_correctness_nn_functional_interpolate_trilinear_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_interpolate_trilinear_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_kl_div_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:2863: UserWarning: reduction: 'mean' divides the total loss by both the batch size and the support size.'batchmean' divides only by the batch size, and aligns with the KL div math definition.'mean' will be changed to behave the same as 'batchmean' in the next major release. | |
warnings.warn( | |
ERROR | |
test_nvfuser_correctness_nn_functional_kl_div_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_kl_div_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_kl_div_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_kl_div_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_kl_div_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_kl_div_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_kl_div_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_layer_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_layer_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_layer_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_layer_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_leaky_relu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_leaky_relu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_leaky_relu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_leaky_relu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_linear_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_linear_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_linear_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_linear_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_linear_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_linear_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_local_response_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_local_response_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_local_response_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_local_response_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_logsigmoid_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_logsigmoid_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_logsigmoid_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_max_pool1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:682: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool1d in a future release. | |
warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change" | |
/fsx/users/dberard/pytorch/torch/nn/functional.py:651: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool1d in a future release. | |
warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change" | |
ok | |
test_nvfuser_correctness_nn_functional_max_pool1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_max_pool1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_max_pool1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_max_pool2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:780: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool2d in a future release. | |
warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change" | |
/fsx/users/dberard/pytorch/torch/nn/functional.py:749: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool2d in a future release. | |
warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change" | |
ok | |
test_nvfuser_correctness_nn_functional_max_pool2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_max_pool2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_max_pool2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_max_pool3d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:878: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool3d in a future release. | |
warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change" | |
/fsx/users/dberard/pytorch/torch/nn/functional.py:847: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool3d in a future release. | |
warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change" | |
ok | |
test_nvfuser_correctness_nn_functional_max_pool3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_max_pool3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_max_pool3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_mish_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_mish_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_mish_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_mish_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_mse_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3228: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! | |
if not (target.size() == input.size()): | |
ok | |
test_nvfuser_correctness_nn_functional_mse_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_mse_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_mse_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_nll_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_nll_loss_cuda_bfloat16! Caching allocator allocated memory was 2056704 and is now reported as 2070528 on device 0. CUDA driver allocated memory was 2686451712 and is now 2686451712. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_nn_functional_nll_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_nll_loss_cuda_float16! Caching allocator allocated memory was 2070528 and is now reported as 2084352 on device 0. CUDA driver allocated memory was 2686451712 and is now 2686451712. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_nn_functional_nll_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_nll_loss_cuda_float32! Caching allocator allocated memory was 2084352 and is now reported as 2098176 on device 0. CUDA driver allocated memory was 2686451712 and is now 2686451712. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_nn_functional_nll_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_nll_loss_cuda_float64! Caching allocator allocated memory was 2098176 and is now reported as 2112000 on device 0. CUDA driver allocated memory was 2686451712 and is now 2686451712. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_nn_functional_normalize_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_normalize_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_normalize_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_normalize_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_normalize_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_normalize_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_one_hot_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_circular_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:4746: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! | |
assert padding[-(idx * 2 + 1)] <= size, "Padding value causes wrapping around more than once." | |
/fsx/users/dberard/pytorch/torch/nn/functional.py:4747: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! | |
assert padding[-(idx * 2 + 2)] <= size, "Padding value causes wrapping around more than once." | |
/fsx/users/dberard/pytorch/torch/nn/functional.py:4749: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! | |
assert ( | |
ok | |
test_nvfuser_correctness_nn_functional_pad_circular_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_circular_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_circular_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_circular_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_circular_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_circular_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_circular_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_circular_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_circular_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_circular_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_circular_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_constant_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_constant_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_constant_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_constant_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_constant_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_constant_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_constant_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_constant_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_constant_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_constant_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_constant_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_constant_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_reflect_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_reflect_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_reflect_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_reflect_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_reflect_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_replicate_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_replicate_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_replicate_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_replicate_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pad_replicate_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_prelu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_prelu_cuda_float16! Caching allocator allocated memory was 2112000 and is now reported as 2121216 on device 0. CUDA driver allocated memory was 2686451712 and is now 2686451712. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_nn_functional_prelu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_prelu_cuda_float32! Caching allocator allocated memory was 2121216 and is now reported as 2130432 on device 0. CUDA driver allocated memory was 2686451712 and is now 2686451712. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_nn_functional_prelu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_prelu_cuda_float64! Caching allocator allocated memory was 2130432 and is now reported as 2139648 on device 0. CUDA driver allocated memory was 2686451712 and is now 2686451712. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_nn_functional_relu6_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_relu6_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_relu6_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_relu6_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_relu6_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_relu6_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_relu6_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_relu6_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_relu6_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_relu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_relu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_relu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_relu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_relu_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_relu_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_relu_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_relu_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_relu_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_rrelu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_rrelu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_rrelu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_rrelu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_selu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_selu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_selu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_selu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_silu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_silu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_silu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_silu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softmin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softmin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softmin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softmin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_softplus_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_softplus_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_softplus_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_softplus_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_softshrink_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_softshrink_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_softshrink_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_softshrink_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_softsign_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softsign_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softsign_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softsign_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softsign_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softsign_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softsign_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softsign_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_softsign_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softsign_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_softsign_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_softsign_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_threshold_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_threshold_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_threshold_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_threshold_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_threshold_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_threshold_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_threshold_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_nn_functional_threshold_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_threshold_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_unfold_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_unfold_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_unfold_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_unfold_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_unfold_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_upsample_bilinear_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:4008: UserWarning: nn.functional.upsample_bilinear is deprecated. Use nn.functional.interpolate instead. | |
warnings.warn("nn.functional.upsample_bilinear is deprecated. Use nn.functional.interpolate instead.") | |
ok | |
test_nvfuser_correctness_nn_functional_upsample_bilinear_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_upsample_bilinear_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_upsample_nearest_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3953: UserWarning: nn.functional.upsample_nearest is deprecated. Use nn.functional.interpolate instead. | |
warnings.warn("nn.functional.upsample_nearest is deprecated. Use nn.functional.interpolate instead.") | |
ok | |
test_nvfuser_correctness_nn_functional_upsample_nearest_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_upsample_nearest_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nn_functional_upsample_nearest_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nonzero_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nonzero_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nonzero_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nonzero_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nonzero_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nonzero_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nonzero_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nonzero_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nonzero_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nonzero_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nonzero_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_nonzero_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_fro_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_fro_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_fro_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_fro_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_fro_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_fro_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_inf_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_inf_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_inf_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_inf_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_inf_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_inf_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_nuc_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_nuc_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_nuc_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_norm_nuc_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_normal_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_normal_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_normal_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_normal_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_normal_number_mean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_normal_number_mean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_normal_number_mean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_normal_number_mean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ones_like_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ones_like_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ones_like_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ones_like_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ones_like_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ones_like_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ones_like_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ones_like_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ones_like_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ones_like_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ones_like_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ones_like_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ormqr_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ormqr_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ormqr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ormqr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_outer_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_outer_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_outer_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_outer_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_outer_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_outer_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_outer_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_outer_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_outer_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_outer_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_outer_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_outer_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_pca_lowrank_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_pca_lowrank_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_permute_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_permute_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_permute_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_permute_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_permute_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_permute_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_permute_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_permute_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_permute_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_permute_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_permute_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_permute_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_pinverse_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_pinverse_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_pinverse_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_pinverse_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polar_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polar_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_positive_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_positive_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_positive_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_positive_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_positive_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_positive_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_positive_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_positive_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_positive_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_positive_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_positive_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_pow_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_pow_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_pow_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_pow_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_pow_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_pow_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_pow_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_pow_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_pow_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_pow_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_pow_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_prod_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_prod_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_prod_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_prod_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_prod_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_prod_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_prod_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_prod_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_prod_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_prod_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_prod_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_prod_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_put_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_put_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_put_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_put_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_put_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_put_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_put_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_put_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_put_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_put_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_put_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_put_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_qr_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_qr_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_qr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_qr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_quantile_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_quantile_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rad2deg_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rad2deg_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rad2deg_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rad2deg_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rad2deg_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rad2deg_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rad2deg_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rad2deg_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rad2deg_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rad2deg_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rand_like_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rand_like_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rand_like_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rand_like_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rand_like_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rand_like_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_randint_like_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_randint_like_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_randint_like_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_randint_like_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_randint_like_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_randint_like_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_randint_like_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_randint_like_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_randint_like_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_randn_like_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_randn_like_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_randn_like_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_randn_like_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_randn_like_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_randn_like_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ravel_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ravel_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ravel_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ravel_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ravel_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ravel_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ravel_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ravel_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ravel_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ravel_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ravel_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_ravel_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_real_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_real_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_real_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_real_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_real_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_real_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_real_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_real_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_real_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_real_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_real_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_real_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reciprocal_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_reciprocal_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_reciprocal_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reciprocal_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reciprocal_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_reciprocal_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_reciprocal_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_reciprocal_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reciprocal_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_reciprocal_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_reciprocal_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reciprocal_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_autodiffed_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_autodiffed_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_autodiffed_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_autodiffed_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_autodiffed_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_autodiffed_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_autodiffed_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_autodiffed_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_autodiffed_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_autodiffed_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_remainder_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_renorm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_renorm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_renorm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_renorm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_renorm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_renorm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_repeat_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_repeat_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_repeat_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_repeat_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_repeat_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_repeat_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_repeat_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_repeat_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_repeat_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_repeat_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_repeat_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_repeat_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_repeat_interleave_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_bfloat16! Caching allocator allocated memory was 2139648 and is now reported as 2140160 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_repeat_interleave_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_bool! Caching allocator allocated memory was 2140160 and is now reported as 2140672 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_repeat_interleave_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_complex128! Caching allocator allocated memory was 2140672 and is now reported as 2141184 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_repeat_interleave_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_complex64! Caching allocator allocated memory was 2141184 and is now reported as 2141696 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_repeat_interleave_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_float16! Caching allocator allocated memory was 2141696 and is now reported as 2142208 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_repeat_interleave_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_float32! Caching allocator allocated memory was 2142208 and is now reported as 2142720 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_repeat_interleave_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_float64! Caching allocator allocated memory was 2142720 and is now reported as 2143232 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_repeat_interleave_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_int16! Caching allocator allocated memory was 2143232 and is now reported as 2143744 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_repeat_interleave_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_int32! Caching allocator allocated memory was 2143744 and is now reported as 2144256 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_repeat_interleave_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_int64! Caching allocator allocated memory was 2144256 and is now reported as 2144768 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_repeat_interleave_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_int8! Caching allocator allocated memory was 2144768 and is now reported as 2145280 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_repeat_interleave_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_uint8! Caching allocator allocated memory was 2145280 and is now reported as 2145792 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_reshape_as_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_as_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_as_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_as_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_as_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_as_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_as_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_as_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_as_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_as_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_as_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_as_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_reshape_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize__cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize__cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize__cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize__cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize__cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize__cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize__cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize__cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize__cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize__cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize__cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize__cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize_as__cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize_as__cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize_as__cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize_as__cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize_as__cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize_as__cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize_as__cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize_as__cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize_as__cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize_as__cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize_as__cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resize_as__cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_conj_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_conj_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_conj_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_conj_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_conj_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_conj_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_conj_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_conj_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_conj_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_conj_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_conj_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_conj_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_neg_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_neg_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_neg_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_neg_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_neg_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_neg_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_neg_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_neg_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_neg_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_neg_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_neg_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_resolve_neg_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_roll_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_roll_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_roll_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_roll_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_roll_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_roll_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_roll_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_roll_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_roll_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_roll_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_roll_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_roll_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rot90_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rot90_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rot90_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rot90_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rot90_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rot90_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rot90_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rot90_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rot90_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rot90_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rot90_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rot90_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_round_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_round_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_round_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_round_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_round_decimals_0_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_round_decimals_0_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_round_decimals_0_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_round_decimals_0_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_round_decimals_3_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_round_decimals_3_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_round_decimals_3_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_round_decimals_3_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_round_decimals_neg_3_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_round_decimals_neg_3_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_round_decimals_neg_3_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_round_decimals_neg_3_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsqrt_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_rsqrt_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_rsqrt_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsqrt_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsqrt_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_rsqrt_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_rsqrt_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_rsqrt_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsqrt_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_rsqrt_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_rsqrt_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsqrt_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_scalar_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_scalar_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_scalar_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_scalar_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_scalar_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_scalar_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_scalar_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_scalar_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_scalar_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_scalar_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_scalar_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_tensor_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_tensor_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_tensor_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_tensor_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_tensor_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_tensor_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_tensor_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_tensor_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_rsub_rsub_tensor_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_tensor_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_rsub_rsub_tensor_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_add_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_add_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_add_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_add_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_add_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_add_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_add_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_add_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_add_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_add_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_add_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_add_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_scatter_reduce_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu' | |
test_nvfuser_correctness_scatter_reduce_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu' | |
test_nvfuser_correctness_scatter_reduce_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu' | |
test_nvfuser_correctness_scatter_reduce_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu' | |
test_nvfuser_correctness_scatter_reduce_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu' | |
test_nvfuser_correctness_scatter_reduce_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu' | |
test_nvfuser_correctness_scatter_reduce_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu' | |
test_nvfuser_correctness_scatter_reduce_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu' | |
test_nvfuser_correctness_scatter_reduce_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu' | |
test_nvfuser_correctness_searchsorted_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_float16! Caching allocator allocated memory was 2145792 and is now reported as 2223616 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_searchsorted_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_float32! Caching allocator allocated memory was 2223616 and is now reported as 2301440 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_searchsorted_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_float64! Caching allocator allocated memory was 2301440 and is now reported as 2379264 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_searchsorted_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_int16! Caching allocator allocated memory was 2379264 and is now reported as 2457088 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_searchsorted_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_int32! Caching allocator allocated memory was 2457088 and is now reported as 2534912 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_searchsorted_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_int64! Caching allocator allocated memory was 2534912 and is now reported as 2612736 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_searchsorted_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_int8! Caching allocator allocated memory was 2612736 and is now reported as 2690560 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_searchsorted_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_uint8! Caching allocator allocated memory was 2690560 and is now reported as 2768384 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_select_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_scatter_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_scatter_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_scatter_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_scatter_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_scatter_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_scatter_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_scatter_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_scatter_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_scatter_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_select_scatter_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sgn_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sgn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sgn_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sgn_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sgn_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sgn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sgn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sgn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sgn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sgn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sgn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sgn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_short_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sigmoid_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sigmoid_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sigmoid_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sigmoid_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sigmoid_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sigmoid_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sigmoid_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sigmoid_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sigmoid_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sigmoid_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sigmoid_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sigmoid_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sign_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sign_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sign_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sign_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sign_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sign_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sign_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sign_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sign_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sign_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_signbit_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_signbit_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_signbit_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_signbit_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_signbit_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_signbit_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_signbit_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_signbit_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_signbit_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_signbit_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sin_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sin_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sin_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinc_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinc_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinc_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinc_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinc_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinc_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinc_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinc_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinc_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinc_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinc_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinc_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinh_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sinh_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_slice_scatter_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_slice_scatter_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_slice_scatter_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_slice_scatter_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_slice_scatter_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_slice_scatter_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_slice_scatter_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_slice_scatter_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_slice_scatter_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_slice_scatter_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_softmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_softmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_softmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_softmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_softmax_with_dtype_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_softmax_with_dtype_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_softmax_with_dtype_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_softmax_with_dtype_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_softmax_with_dtype_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_softmax_with_dtype_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_softmax_with_dtype_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_softmax_with_dtype_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_softmax_with_dtype_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_softmax_with_dtype_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_softmax_with_dtype_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_softmax_with_dtype_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_solve_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_solve_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_solve_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_solve_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_sort_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sort_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sort_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sort_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sort_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sort_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sort_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sort_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sort_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_entr_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_entr_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_entr_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_entr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_entr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_entr_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_entr_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_entr_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_entr_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_entr_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_erfcx_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_erfcx_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_erfcx_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_erfcx_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_erfcx_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_erfcx_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_erfcx_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_erfcx_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i0e_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i0e_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i0e_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i0e_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i0e_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i0e_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i0e_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i0e_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i0e_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i0e_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i1_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i1_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i1_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i1_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i1_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i1_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i1_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i1_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i1e_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i1e_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i1e_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i1e_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i1e_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i1e_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i1e_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_i1e_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_ndtr_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_ndtr_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_ndtr_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_ndtr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_ndtr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_ndtr_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_ndtr_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_ndtr_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_ndtr_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_ndtr_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_ndtri_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_ndtri_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_ndtri_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_ndtri_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_ndtri_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_ndtri_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_ndtri_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_ndtri_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_xlog1py_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_xlog1py_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_xlog1py_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_xlog1py_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_xlog1py_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_xlog1py_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_xlog1py_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_xlog1py_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_xlog1py_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_xlog1py_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_zeta_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_zeta_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_zeta_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_zeta_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_zeta_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_zeta_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_zeta_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_zeta_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_zeta_grad_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_zeta_grad_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_zeta_grad_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_zeta_grad_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_zeta_grad_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_zeta_grad_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_zeta_grad_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_special_zeta_grad_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_list_args_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_list_args_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_list_args_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_list_args_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_list_args_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_list_args_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_list_args_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_list_args_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_list_args_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_list_args_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_list_args_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_list_args_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_with_sizes_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_with_sizes_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_with_sizes_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_with_sizes_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_with_sizes_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_with_sizes_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_with_sizes_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_with_sizes_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_with_sizes_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_with_sizes_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_with_sizes_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_split_with_sizes_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sqrt_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sqrt_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sqrt_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sqrt_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sqrt_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sqrt_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sqrt_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sqrt_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sqrt_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sqrt_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sqrt_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sqrt_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_square_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_square_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_square_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_square_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_square_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_square_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_square_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_square_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_square_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_square_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_square_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_square_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_squeeze_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_squeeze_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_squeeze_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_squeeze_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_squeeze_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_squeeze_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_squeeze_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_squeeze_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_squeeze_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_squeeze_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_squeeze_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_squeeze_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_stack_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_stack_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_stack_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_stack_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_stack_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_stack_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_stack_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_stack_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_stack_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_stack_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_stack_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_stack_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_std_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_std_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_std_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_std_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_std_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_std_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_std_mean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_std_mean_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_std_mean_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_std_mean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_std_mean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_std_mean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_stft_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_stft_cuda_complex128! Caching allocator allocated memory was 2768384 and is now reported as 2768896 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_stft_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_stft_cuda_complex64! Caching allocator allocated memory was 2768896 and is now reported as 2769408 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_stft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/functional.py:695: UserWarning: stft will soon require the return_complex parameter be given for real inputs, and will further require that return_complex=True in a future PyTorch release. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:798.) | |
return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] | |
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_stft_cuda_float32! Caching allocator allocated memory was 2769408 and is now reported as 2769920 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_stft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_stft_cuda_float64! Caching allocator allocated memory was 2769920 and is now reported as 2770432 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168. | |
warnings.warn(msg) | |
ok | |
test_nvfuser_correctness_sub_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sub_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sub_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sub_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sub_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sub_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sub_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sub_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sub_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sub_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sub_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sum_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sum_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sum_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sum_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sum_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sum_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sum_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sum_to_size_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sum_to_size_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sum_to_size_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sum_to_size_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_sum_to_size_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_sum_to_size_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_svd_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_svd_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_svd_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_svd_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_svd_lowrank_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_svd_lowrank_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_symeig_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_symeig_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_symeig_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_symeig_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_t_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_t_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_t_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_t_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_t_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_t_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_t_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_t_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_t_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_t_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_t_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_t_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_along_dim_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_along_dim_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_along_dim_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_along_dim_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_along_dim_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_along_dim_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_along_dim_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_along_dim_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_along_dim_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_along_dim_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_along_dim_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_along_dim_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_take_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tan_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tan_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tan_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tan_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tan_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tan_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tan_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tan_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tan_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tan_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tan_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tan_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tanh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tanh_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tanh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tanh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tanh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tanh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tanh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tanh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tanh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tanh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tanh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tanh_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tensor_split_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tensor_split_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tensor_split_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tensor_split_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tensor_split_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tensor_split_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tensor_split_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tensor_split_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tensor_split_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tensor_split_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tensor_split_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tensor_split_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tensordot_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tensordot_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tensordot_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tensordot_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tensordot_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tensordot_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tile_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tile_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tile_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tile_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tile_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tile_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tile_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tile_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tile_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tile_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tile_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tile_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_to_sparse_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... [W pybind_utils.cpp:39] Warning: Using sparse tensors in TorchScript is experimental. Many optimization pathways have not been thoroughly tested with sparse tensors. Please include the fact that the network is running sparse tensors in any bug reports submitted. (function operator()) | |
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:424: UserWarning: Using sparse tensors in TorchScript is experimental. Many optimization pathways have not been thoroughly tested with sparse tensors. Please include the fact that the network is running sparse tensors in any bug reports submitted. (Triggered internally at ../torch/csrc/jit/python/pybind_utils.h:691.) | |
return callable(*args, **kwargs) | |
ok | |
test_nvfuser_correctness_to_sparse_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_to_sparse_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_to_sparse_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_to_sparse_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_to_sparse_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_to_sparse_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_to_sparse_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_to_sparse_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_to_sparse_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_to_sparse_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_to_sparse_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_topk_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_topk_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_topk_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_topk_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_topk_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_topk_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_topk_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_topk_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_topk_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trace_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trace_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trace_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trace_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trace_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trace_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trace_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trace_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trace_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trace_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trace_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trace_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_transpose_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_transpose_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_transpose_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_transpose_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_transpose_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_transpose_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_transpose_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_transpose_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_transpose_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_transpose_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_transpose_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_transpose_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapezoid_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapezoid_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapezoid_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapezoid_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapezoid_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapezoid_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapezoid_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapezoid_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapezoid_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapezoid_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapezoid_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapz_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapz_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapz_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapz_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapz_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapz_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapz_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapz_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapz_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapz_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trapz_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_triangular_solve_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_triangular_solve_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_triangular_solve_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_triangular_solve_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected' | |
test_nvfuser_correctness_tril_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tril_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tril_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tril_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tril_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tril_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tril_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tril_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tril_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tril_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_tril_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_triu_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_triu_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_triu_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_triu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_triu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_triu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_triu_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_triu_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_triu_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_triu_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_triu_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_true_divide_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_true_divide_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_true_divide_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_true_divide_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_true_divide_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_true_divide_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_true_divide_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_true_divide_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_true_divide_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_true_divide_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR | |
test_nvfuser_correctness_true_divide_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_true_divide_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trunc_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trunc_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trunc_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_trunc_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unfold_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unfold_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unfold_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unfold_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unfold_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unfold_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unfold_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unfold_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unfold_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unfold_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unfold_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unfold_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unique_consecutive_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unique_consecutive_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unique_consecutive_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unique_consecutive_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unique_consecutive_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unique_consecutive_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unique_consecutive_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unique_consecutive_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unique_consecutive_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unique_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unique_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unique_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unique_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unique_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unique_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unique_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unique_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unique_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unsqueeze_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unsqueeze_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unsqueeze_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unsqueeze_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unsqueeze_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unsqueeze_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unsqueeze_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unsqueeze_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unsqueeze_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unsqueeze_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unsqueeze_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_unsqueeze_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_var_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_var_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_var_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_var_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_var_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_var_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_var_mean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_var_mean_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_var_mean_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_var_mean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_var_mean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_var_mean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_vdot_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_vdot_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_vdot_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_vdot_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_vdot_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_vdot_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok | |
test_nvfuser_correctness_view_as_complex_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ../c10/util/TypeCast.h:135: fetch_and_cast: block: [0,0,0], thread: [0,0,0] Assertion `false` failed. | |
../c10/util/TypeCast.h:135: fetch_and_cast: block: [0,0,0], thread: [1,0,0] Assertion `false` failed. | |
../c10/util/TypeCast.h:135: fetch_and_cast: block: [0,0,0], thread: [2,0,0] Assertion `false` failed. | |
../c10/util/TypeCast.h:135: fetch_and_cast: block: [0,0,0], thread: [3,0,0] Assertion `false` failed. | |
../c10/util/TypeCast.h:135: fetch_and_cast: block: [0,0,0], thread: [4,0,0] Assertion `false` failed. | |
ERROR | |
====================================================================== | |
ERROR: test_nvfuser_correctness___getitem___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_bfloat16! Caching allocator allocated memory was 512 and is now reported as 35328 on device 0. CUDA driver allocated memory was 1369440256 and is now 1371537408. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___getitem___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_bool! Caching allocator allocated memory was 35328 and is now reported as 70144 on device 0. CUDA driver allocated memory was 1371537408 and is now 1373634560. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___getitem___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_complex128! Caching allocator allocated memory was 70144 and is now reported as 104960 on device 0. CUDA driver allocated memory was 1373634560 and is now 1375731712. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___getitem___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_complex64! Caching allocator allocated memory was 104960 and is now reported as 139776 on device 0. CUDA driver allocated memory was 1375731712 and is now 1377828864. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___getitem___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_float16! Caching allocator allocated memory was 139776 and is now reported as 174592 on device 0. CUDA driver allocated memory was 1377828864 and is now 1379926016. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___getitem___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_float32! Caching allocator allocated memory was 174592 and is now reported as 209408 on device 0. CUDA driver allocated memory was 1379926016 and is now 1382023168. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___getitem___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_float64! Caching allocator allocated memory was 209408 and is now reported as 244224 on device 0. CUDA driver allocated memory was 1382023168 and is now 1384120320. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___getitem___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_int16! Caching allocator allocated memory was 244224 and is now reported as 279040 on device 0. CUDA driver allocated memory was 1384120320 and is now 1386217472. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___getitem___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_int32! Caching allocator allocated memory was 279040 and is now reported as 313856 on device 0. CUDA driver allocated memory was 1386217472 and is now 1388314624. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___getitem___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_int64! Caching allocator allocated memory was 313856 and is now reported as 348672 on device 0. CUDA driver allocated memory was 1388314624 and is now 1390411776. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___getitem___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_int8! Caching allocator allocated memory was 348672 and is now reported as 383488 on device 0. CUDA driver allocated memory was 1390411776 and is now 1392508928. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___getitem___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_uint8! Caching allocator allocated memory was 383488 and is now reported as 418304 on device 0. CUDA driver allocated memory was 1392508928 and is now 1394606080. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___radd___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Illegal Cast value from DataType: int64_t to DataType: int | |
====================================================================== | |
ERROR: test_nvfuser_correctness___rmul___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Illegal Cast value from DataType: int64_t to DataType: int | |
====================================================================== | |
ERROR: test_nvfuser_correctness___rpow___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_bfloat16! Caching allocator allocated memory was 418304 and is now reported as 422400 on device 0. CUDA driver allocated memory was 1675624448 and is now 1677721600. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___rpow___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_bool! Caching allocator allocated memory was 422400 and is now reported as 426496 on device 0. CUDA driver allocated memory was 1677721600 and is now 1679818752. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___rpow___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_complex128! Caching allocator allocated memory was 426496 and is now reported as 430592 on device 0. CUDA driver allocated memory was 1679818752 and is now 1681915904. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___rpow___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_complex64! Caching allocator allocated memory was 430592 and is now reported as 434688 on device 0. CUDA driver allocated memory was 1681915904 and is now 1684013056. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___rpow___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_float16! Caching allocator allocated memory was 434688 and is now reported as 438784 on device 0. CUDA driver allocated memory was 1684013056 and is now 1686110208. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___rpow___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_float32! Caching allocator allocated memory was 438784 and is now reported as 442880 on device 0. CUDA driver allocated memory was 1686110208 and is now 1688207360. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___rpow___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_float64! Caching allocator allocated memory was 442880 and is now reported as 446976 on device 0. CUDA driver allocated memory was 1688207360 and is now 1690304512. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___rpow___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_int16! Caching allocator allocated memory was 446976 and is now reported as 451072 on device 0. CUDA driver allocated memory was 1690304512 and is now 1692401664. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___rpow___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_int32! Caching allocator allocated memory was 451072 and is now reported as 455168 on device 0. CUDA driver allocated memory was 1692401664 and is now 1694498816. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___rpow___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_int64! Caching allocator allocated memory was 455168 and is now reported as 459264 on device 0. CUDA driver allocated memory was 1694498816 and is now 1696595968. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___rpow___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_int8! Caching allocator allocated memory was 459264 and is now reported as 463360 on device 0. CUDA driver allocated memory was 1696595968 and is now 1698693120. | |
====================================================================== | |
ERROR: test_nvfuser_correctness___rpow___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_uint8! Caching allocator allocated memory was 463360 and is now reported as 467456 on device 0. CUDA driver allocated memory was 1698693120 and is now 1700790272. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_amax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_amax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_amax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_amax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_amax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Could not generate a max op for tensor with type: int | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_amax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_amin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: sorted_exprs.size() > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp":1403, please report a bug to PyTorch. Error during expression sorting, no expressions produced. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_amin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: sorted_exprs.size() > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp":1403, please report a bug to PyTorch. Error during expression sorting, no expressions produced. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_amin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: sorted_exprs.size() > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp":1403, please report a bug to PyTorch. Error during expression sorting, no expressions produced. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_amin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: sorted_exprs.size() > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp":1403, please report a bug to PyTorch. Error during expression sorting, no expressions produced. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_amin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amin_cuda_int8! Caching allocator allocated memory was 636416 and is now reported as 664576 on device 0. CUDA driver allocated memory was 1700790272 and is now 1702887424. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_amin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amin_cuda_uint8! Caching allocator allocated memory was 664576 and is now reported as 692736 on device 0. CUDA driver allocated memory was 1702887424 and is now 1704984576. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_log_softmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_log_softmax_cuda_bfloat16! Caching allocator allocated memory was 692736 and is now reported as 703488 on device 0. CUDA driver allocated memory was 1704984576 and is now 1707081728. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_log_softmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_log_softmax_cuda_float16! Caching allocator allocated memory was 703488 and is now reported as 714240 on device 0. CUDA driver allocated memory was 1707081728 and is now 1709178880. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_log_softmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_log_softmax_cuda_float32! Caching allocator allocated memory was 714240 and is now reported as 724992 on device 0. CUDA driver allocated memory was 1709178880 and is now 1711276032. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_log_softmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_log_softmax_cuda_float64! Caching allocator allocated memory was 724992 and is now reported as 735744 on device 0. CUDA driver allocated memory was 1711276032 and is now 1713373184. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_mean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_mean_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: falseINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/executor_utils.cpp":828, please report a bug to PyTorch. namespace CudaCodeGen { | |
typedef unsigned char uint8_t; | |
typedef signed char int8_t; | |
typedef short int int16_t; | |
typedef int int32_t; | |
typedef unsigned int uint32_t; | |
typedef long long int int64_t; | |
typedef unsigned long long int uint64_t; | |
typedef int nvfuser_index_t; | |
#define __NVFUSER_HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var))) | |
#define __NVFUSER_HALF_TO_CUS(var) \ | |
*(reinterpret_cast<const unsigned short*>(&(var))) | |
struct __half; | |
__device__ __half __float2half(const float); | |
struct __align__(2) __half { | |
__half() = default; | |
__device__ __half(const float f) { | |
__x = __float2half(f).__x; | |
} | |
protected: | |
unsigned short __x; | |
}; | |
__device__ __half __float2half(const float f) { | |
__half val; | |
asm("{ cvt.rn.f16.f32 %0, %1;}\n" | |
: "=h"(__NVFUSER_HALF_TO_US(val)) | |
: "f"(f)); | |
return val; | |
} | |
__device__ float __half2float(const __half h) { | |
float val; | |
asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__NVFUSER_HALF_TO_CUS(h))); | |
return val; | |
} | |
// aligned vector generates vectorized load/store on CUDA | |
template <typename scalar_t, int vec_size> | |
struct alignas(sizeof(scalar_t) * vec_size) Array { | |
scalar_t val[vec_size]; | |
__device__ void set(scalar_t v) { | |
for (int i = 0; i < vec_size; ++i) { | |
val[i] = v; | |
} | |
} | |
}; | |
#define __NVFUSER_BFLOAT_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var))) | |
#define __NVFUSER_BFLOAT_TO_CUS(var) \ | |
*(reinterpret_cast<const unsigned short*>(&(var))) | |
struct __bfloat; | |
__device__ __bfloat __float2bfloat(const float); | |
struct __align__(2) __bfloat { | |
__bfloat() = default; | |
__device__ __bfloat(const float f) { | |
__x = __float2bfloat(f).__x; | |
} | |
protected: | |
unsigned short __x; | |
}; | |
__device__ __bfloat __float2bfloat(const float f) { | |
__bfloat val; | |
asm("{ cvt.rn.bf16.f32 %0, %1;}\n" | |
: "=h"(__NVFUSER_BFLOAT_TO_US(val)) | |
: "f"(f)); | |
return val; | |
} | |
__device__ float __bfloat2float(const __bfloat h) { | |
float val; | |
asm("{ mov.b32 %0, {0,%1};}\n" | |
: "=f"(val) | |
: "h"(__NVFUSER_BFLOAT_TO_CUS(h))); | |
return val; | |
} | |
template <typename T, int N> | |
struct Tensor { | |
__device__ T& operator[](nvfuser_index_t ind) { | |
return data[ind]; | |
}; | |
T* data; | |
nvfuser_index_t size[N]; | |
nvfuser_index_t stride[N]; | |
}; | |
// Specialization for 0-dim case as it does not need size and stride arrays. | |
// They will be an error as well since zero-length arrays are not allowed. | |
template <typename T> | |
struct Tensor<T, 0> { | |
// Any index dereferences the single scalar element.
__device__ T& operator[](nvfuser_index_t) { | |
return *data; | |
}; | |
T* data; | |
}; | |
// Philox4x32-10 counter-based RNG. (seed, subsequence, offset) select a
// deterministic position in the random stream; each operator() call yields
// one 32-bit draw.
class Philox { | |
public: | |
__device__ Philox( | |
unsigned long long seed, | |
unsigned long long subsequence, | |
unsigned long long offset) { | |
key.x = (unsigned int)seed; | |
key.y = (unsigned int)(seed >> 32); | |
counter = make_uint4(0, 0, 0, 0); | |
counter.z = (unsigned int)(subsequence); | |
counter.w = (unsigned int)(subsequence >> 32); | |
STATE = 0; | |
// Each counter position yields 4 draws, so advance by offset/4.
incr_n(offset / 4); | |
} | |
// Return the next 32-bit random value. A full 4x32 block is generated
// every 4th call (STATE == 0) and handed out one lane at a time.
__device__ unsigned long operator()() { | |
if (STATE == 0) { | |
uint4 counter_ = counter; | |
uint2 key_ = key; | |
// 9 keyed rounds here plus the final round below = 10 rounds total.
for (int i = 0; i < 9; i++) { | |
counter_ = single_round(counter_, key_); | |
key_.x += (kPhilox10A); | |
key_.y += (kPhilox10B); | |
} | |
output = single_round(counter_, key_); | |
incr(); | |
} | |
unsigned long ret = 0; | |
switch (STATE) { | |
case 0: | |
ret = output.x; | |
break; | |
case 1: | |
ret = output.y; | |
break; | |
case 2: | |
ret = output.z; | |
break; | |
case 3: | |
ret = output.w; | |
break; | |
} | |
STATE = (STATE + 1) % 4; | |
return ret; | |
} | |
private: | |
// Advance the 128-bit counter by n, propagating carries across the lanes.
__device__ void incr_n(unsigned long long n) { | |
unsigned int nlo = (unsigned int)(n); | |
unsigned int nhi = (unsigned int)(n >> 32); | |
counter.x += nlo; | |
// Wrap-around of the low lane carries into nhi.
if (counter.x < nlo) | |
nhi++; | |
counter.y += nhi; | |
if (nhi <= counter.y) | |
return; | |
if (++counter.z) | |
return; | |
++counter.w; | |
} | |
// Advance the 128-bit counter by one (a zero result means carry onward).
__device__ void incr() { | |
if (++counter.x) | |
return; | |
if (++counter.y) | |
return; | |
if (++counter.z) | |
return; | |
++counter.w; | |
} | |
// 32x32 -> 64-bit multiply split into low (returned) and high halves.
__device__ unsigned int mulhilo32( | |
unsigned int a, | |
unsigned int b, | |
unsigned int* result_high) { | |
*result_high = __umulhi(a, b); | |
return a * b; | |
} | |
// One Philox round: two widening multiplies plus key mixing, lanes permuted.
__device__ uint4 single_round(uint4 ctr, uint2 key) { | |
unsigned int hi0; | |
unsigned int hi1; | |
unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0); | |
unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1); | |
uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0}; | |
return ret; | |
} | |
private: | |
// Philox W32 key-schedule increments and round multipliers.
static constexpr unsigned long kPhilox10A = 0x9E3779B9; | |
static constexpr unsigned long kPhilox10B = 0xBB67AE85; | |
static constexpr unsigned long kPhiloxSA = 0xD2511F53; | |
static constexpr unsigned long kPhiloxSB = 0xCD9E8D57; | |
uint4 counter = {}; | |
uint4 output = {}; | |
uint2 key = {}; | |
// Which lane of `output` the next call returns (0-3).
unsigned int STATE = 0; | |
}; | |
// Map a 32-bit draw to a float in [0, 1).
__device__ float uniformf(unsigned int x) { | |
constexpr float kRanInvM32 = 2.3283064e-10f; // Inverse of 2^32. | |
return x * kRanInvM32; | |
} | |
// Map two 32-bit draws to a double with 53 bits of randomness, offset by
// half an ulp so the result avoids exact 0.
__device__ double uniform(unsigned int x, unsigned int y) { | |
constexpr double kRan2Pow53Inv = 1.1102230246251565e-16; | |
const unsigned long long z = | |
(unsigned long long)x ^ ((unsigned long long)y << (53 - 32)); | |
return z * kRan2Pow53Inv + (kRan2Pow53Inv / 2.0); | |
} | |
// Defines `nvfuser_zero`, a value that is always 0 but that the compiler
// cannot prove is 0 (it comes out of shared memory via atomicMin). Generated
// code adds it to indices to defeat over-aggressive optimizations.
#define NVFUSER_DEFINE_MAGIC_ZERO \ | |
__shared__ int nvfuser_zero_s; \ | |
if (threadIdx.x == 0) \ | |
nvfuser_zero_s = 0; \ | |
__syncthreads(); \ | |
atomicMin(&nvfuser_zero_s, threadIdx.x); \ | |
int nvfuser_zero = nvfuser_zero_s; | |
// Keeps nvfuser_zero opaque between uses (0 << 1 is still 0).
#define NVFUSER_UPDATE_MAGIC_ZERO \ | |
do { \ | |
nvfuser_zero <<= 1; \ | |
} while (0); | |
// Integer division rounding up, via the standard (a + b - 1) / b trick
// (rounds up for non-negative a and positive b). Mixed-width overloads
// promote to int64_t.
__device__ constexpr int ceilDiv(int a, int b) { | |
return (a + b - 1) / b; | |
} | |
__device__ constexpr int64_t ceilDiv(int64_t a, int64_t b) { | |
return (a + b - 1) / b; | |
} | |
__device__ constexpr int64_t ceilDiv(int64_t a, int b) { | |
return ceilDiv(a, (int64_t)b); | |
} | |
__device__ constexpr int64_t ceilDiv(int a, int64_t b) { | |
return ceilDiv((int64_t)a, b); | |
} | |
// Integer max overloads; mixed-width forms promote to int64_t before
// deferring to the built-in ::max.
__device__ constexpr int max(int a, int b) { | |
return ::max(a, b); | |
} | |
__device__ constexpr int64_t max(int64_t a, int b) { | |
return ::max(a, (int64_t)b); | |
} | |
__device__ constexpr int64_t max(int a, int64_t b) { | |
return ::max((int64_t)a, b); | |
} | |
__device__ constexpr int64_t max(int64_t a, int64_t b) { | |
return ::max(a, b); | |
} | |
// NaN-propagating maximum: a NaN operand is returned as-is (the C library
// fmax instead prefers the non-NaN operand); when both are NaN, `a` wins.
// Non-NaN inputs defer to ::fmax.
__device__ double fmax(double a, double b) { | |
// check and propagate NaN | |
if (a != a) { | |
return a; | |
} else if (b != b) { | |
return b; | |
} else { | |
return ::fmax(a, b); | |
} | |
} | |
// Single-precision variant of the NaN-propagating maximum above.
__device__ float fmax(float a, float b) { | |
// check and propagate NaN | |
if (a != a) { | |
return a; | |
} else if (b != b) { | |
return b; | |
} else { | |
return ::fmax(a, b); | |
} | |
} | |
// Integer min overloads; mixed-width forms promote to int64_t before
// deferring to the built-in ::min.
__device__ constexpr int min(int a, int b) { | |
return ::min(a, b); | |
} | |
__device__ constexpr int64_t min(int64_t a, int b) { | |
return ::min(a, (int64_t)b); | |
} | |
__device__ constexpr int64_t min(int a, int64_t b) { | |
return ::min((int64_t)a, b); | |
} | |
__device__ constexpr int64_t min(int64_t a, int64_t b) { | |
return ::min(a, b); | |
} | |
// NaN-propagating minimum: a NaN operand is returned as-is (the C library
// fmin instead prefers the non-NaN operand); when both are NaN, `a` wins.
// Non-NaN inputs defer to ::fmin.
__device__ double fmin(double a, double b) { | |
// check and propagate NaN | |
if (a != a) { | |
return a; | |
} else if (b != b) { | |
return b; | |
} else { | |
return ::fmin(a, b); | |
} | |
} | |
// Single-precision variant of the NaN-propagating minimum above.
__device__ float fmin(float a, float b) { | |
// check and propagate NaN | |
if (a != a) { | |
return a; | |
} else if (b != b) { | |
return b; | |
} else { | |
return ::fmin(a, b); | |
} | |
} | |
// Round `buffer` up to the next multiple of `size`; the mask trick is valid
// when `size` is a power of two.
__device__ constexpr int alignBufferSize(int buffer, int size) { | |
return (buffer + (size - 1)) & ~(size - 1); | |
} | |
// Saturate x into [minv, maxv]; the lower bound wins when the bounds cross.
__device__ double clamp(double x, double minv, double maxv) {
  if (x < minv) {
    return minv;
  }
  return x > maxv ? maxv : x;
}
// Single-precision input; bounds stay double, as in the generated callers.
__device__ float clamp(float x, double minv, double maxv) {
  if (x < minv) {
    return minv;
  }
  return x > maxv ? maxv : x;
}
// Fractional part with the sign of x (uses trunc, not floor).
__device__ double frac(double x) {
  const double whole = trunc(x);
  return x - whole;
}
// Single-precision variant.
__device__ float frac(float x) {
  const float whole = trunc(x);
  return x - whole;
}
// Exact (CDF-based) GELU: x * Phi(x), using the CUDA normcdf intrinsic.
__device__ double gelu(double x) { | |
return x * normcdf(x); | |
} | |
__device__ float gelu(float x) { | |
return x * normcdf(x); | |
} | |
// Multiplicative inverse, 1 / x.
__device__ double reciprocal(double x) { | |
return 1 / x; | |
} | |
__device__ float reciprocal(float x) { | |
return 1 / x; | |
} | |
// ReLU: x for positive inputs, 0 otherwise.
// NOTE(review): the int64_t and int overloads return float, so int64_t
// inputs above 2^24 lose precision on the way out — presumably this matches
// the code generator's output-type promotion; verify against generated
// callers.
__device__ double relu(double x) { | |
return x <= 0 ? 0 : x; | |
} | |
__device__ float relu(float x) { | |
return x <= 0 ? 0 : x; | |
} | |
__device__ float relu(int64_t x) { | |
return x <= 0 ? 0 : x; | |
} | |
__device__ float relu(int x) { | |
return x <= 0 ? 0 : x; | |
} | |
// Python-style modulo for doubles: fmod gives a result with the sign of
// `a`; shift by `b` when that sign disagrees with `b` so the result takes
// the divisor's sign.
__device__ double remainder(double a, double b) {
  auto r = ::fmod(a, b);
  const bool sign_mismatch = (r != 0) && ((b < 0) != (r < 0));
  return sign_mismatch ? r + b : r;
}
// Single-precision variant of the divisor-signed modulo above.
__device__ float remainder(float a, float b) {
  auto r = ::fmod(a, b);
  const bool sign_mismatch = (r != 0) && ((b < 0) != (r < 0));
  return sign_mismatch ? r + b : r;
}
// Logistic sigmoid, 1 / (1 + e^-x).
__device__ double sigmoid(double x) {
  const double ex = exp(-x);
  return 1 / (1 + ex);
}
// Single-precision logistic sigmoid.
__device__ float sigmoid(float x) {
  const float ex = exp(-x);
  return 1 / (1 + ex);
}
// SiLU (a.k.a. swish): x scaled by its own sigmoid.
__device__ double silu(double x) {
  const double s = sigmoid(x);
  return x * s;
}
// Single-precision SiLU.
__device__ float silu(float x) {
  const float s = sigmoid(x);
  return x * s;
}
// Pass x through when it exceeds threshold t; otherwise replace it with v.
__device__ double threshold(double x, double t, double v) {
  if (x <= t) {
    return v;
  }
  return x;
}
// Single-precision input; t and v stay double, as in the generated callers.
__device__ float threshold(float x, double t, double v) {
  if (x <= t) {
    return v;
  }
  return x;
}
// Scalar select: return a when the condition holds, b otherwise.
__device__ double where(bool c, double a, double b) {
  if (c) {
    return a;
  }
  return b;
}
__device__ float where(bool c, float a, float b) {
  if (c) {
    return a;
  }
  return b;
}
__device__ int64_t where(bool c, int64_t a, int64_t b) {
  if (c) {
    return a;
  }
  return b;
}
// Uniform double in (0, 1) built from two consecutive Philox draws.
// Bug fix: the original wrote `uniform(rnd(), rnd())`, but C++ leaves the
// evaluation order of function arguments unspecified, so which stateful
// draw became the high bits depended on the compiler, making the generated
// random sequence toolchain-dependent. Sequence the draws explicitly.
__device__ double randLike(Philox& rnd) {
  unsigned int first_draw = rnd();
  unsigned int second_draw = rnd();
  return uniform(first_draw, second_draw);
}
// Uniform float in [0, 1) from a single Philox draw.
__device__ float randLikef(Philox& rnd) {
  return uniformf(rnd());
}
// Python-style modulo for int64_t: C++ % truncates toward zero, so fold in
// a correction when the sign of the raw remainder disagrees with b; the
// result then carries the divisor's sign.
__device__ constexpr int64_t remainder(int64_t a, int64_t b) {
  const int64_t m = a % b;
  return (m != 0 && (b < 0) != (m < 0)) ? m + b : m;
}
// int variant of the divisor-signed modulo above.
__device__ constexpr int remainder(int a, int b) {
  const int m = a % b;
  return (m != 0 && (b < 0) != (m < 0)) ? m + b : m;
}
// fmod overloads: integer forms use the truncating % operator, floating
// forms defer to the library fmod; either way the result takes the sign of
// `a` (C semantics), unlike the divisor-signed `remainder` above.
__device__ constexpr int64_t fmod(int64_t a, int64_t b) { | |
return a % b; | |
} | |
__device__ constexpr int fmod(int a, int b) { | |
return a % b; | |
} | |
__device__ constexpr double fmod(double a, double b) { | |
return ::fmod(a, b); | |
} | |
__device__ constexpr float fmod(float a, float b) { | |
return ::fmod(a, b); | |
} | |
// Integer pow by binary exponentiation. For b < 0 the exact result is
// fractional, so it truncates: 1^b == 1, (-1)^b alternates sign with the
// parity of b, anything else becomes 0.
template <typename T> | |
__device__ T pow(T a, T b) { | |
if (b < 0) { | |
if (a == 1) { | |
return 1; | |
} else if (a == -1) { | |
// Odd |b| keeps the negative sign.
auto negative = (-b) % static_cast<T>(2); | |
return negative ? -1 : 1; | |
} else { | |
return 0; | |
} | |
} else { | |
T result = 1; | |
// Square-and-multiply over the bits of b.
while (b) { | |
if (b & 1) { | |
result *= a; | |
} | |
b /= 2; | |
a *= a; | |
} | |
return result; | |
} | |
} | |
// Explicit instantiations for the integer types used by generated kernels.
template int pow<int>(int a, int b); | |
template int64_t pow<int64_t>(int64_t a, int64_t b); | |
// Floating-point bases/exponents defer to the library pow.
template <> | |
float pow<float>(float a, float b) { | |
return ::pow(a, b); | |
} | |
template <> | |
double pow<double>(double a, double b) { | |
return ::pow(a, b); | |
} | |
namespace index_utils { | |
// Utility functions | |
// Total size of provided dimension | |
template <typename _dim3> | |
__device__ __forceinline__ nvfuser_index_t size(const _dim3& d) { | |
return (nvfuser_index_t)d.x * (nvfuser_index_t)d.y * (nvfuser_index_t)d.z; | |
} | |
// Linearized indexing of idx based on dim, if bool==false that dimension does | |
// not participate | |
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2> | |
__device__ nvfuser_index_t maskedOffset(const _dim3& idx, const _dim3_2& dim) { | |
nvfuser_index_t offset = 0; | |
if (Z) | |
offset += idx.z; | |
if (Y) | |
offset = offset * dim.y + idx.y; | |
if (X) | |
offset = offset * dim.x + idx.x; | |
return offset; | |
} | |
// Linearized indexing of idx based on dim. All dimensions participate. | |
// z-major layout: offset = (idx.z * dim.y + idx.y) * dim.x + idx.x.
template <typename _dim3, typename _dim3_2> | |
__device__ nvfuser_index_t offset(const _dim3& idx, const _dim3_2& dim) { | |
nvfuser_index_t offset = idx.z; | |
offset = offset * dim.y + idx.y; | |
offset = offset * dim.x + idx.x; | |
return offset; | |
} | |
// Masks the provided dim3, those == false get truncated to 1 | |
template <bool X, bool Y, bool Z, typename _dim3> | |
__device__ dim3 maskedDims(const _dim3& dim) { | |
return dim3{ | |
X ? (unsigned)dim.x : 1U, | |
Y ? (unsigned)dim.y : 1U, | |
Z ? (unsigned)dim.z : 1U}; | |
} | |
// Provides total size of dim with masking, those dims == false do not | |
// participate in the size calculation | |
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3> | |
__device__ nvfuser_index_t maskedSize(const _dim3& dim) { | |
return size(maskedDims<X_BLOCK, Y_BLOCK, Z_BLOCK>(dim)); | |
} | |
// Checks if provided idx is zero on those dims == true | |
template <bool X, bool Y, bool Z, typename _dim3> | |
__device__ bool maskedIsZero(const _dim3& idx) { | |
bool isZero = true; | |
if (X) | |
isZero = isZero && idx.x == 0; | |
if (Y) | |
isZero = isZero && idx.y == 0; | |
if (Z) | |
isZero = isZero && idx.z == 0; | |
return isZero; | |
} | |
// Checks if provided idx is the last element (dim - 1) on those dims == true | |
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2> | |
__device__ bool maskedIsLast(const _dim3& idx, const _dim3_2& dim) { | |
bool isZero = true; | |
if (X) | |
isZero = isZero && idx.x == dim.x - 1; | |
if (Y) | |
isZero = isZero && idx.y == dim.y - 1; | |
if (Z) | |
isZero = isZero && idx.z == dim.z - 1; | |
return isZero; | |
} | |
} // namespace index_utils | |
// Default block synchronization. Just use __barrier_sync | |
namespace block_sync { | |
// No setup needed for the default barrier-based implementation.
__forceinline__ __device__ void init() {} | |
// Thread-block synchronization | |
__forceinline__ __device__ void sync() { | |
__barrier_sync(0); | |
} | |
} // namespace block_sync | |
namespace grid_sync { | |
// Mask for the most significant bit of a 64 bit integer | |
#define FIRST_UINT64_BIT ((uint64_t)1 << (sizeof(uint64_t) * 8 - 1)) | |
// Read a global value through a volatile reference so polling loops always
// observe the latest value rather than a cached one.
template <typename T> | |
__device__ T globalAsVolatile(volatile T& global_val) { | |
return global_val; | |
} | |
// A grid synchronization that can be called multiple times in a kernel assuming | |
// all the blocks fit on device at once. The semaphore is an integer semaphore | |
// assumed to be initialized to 0 before launching the kernel. The persistent | |
// option should be invoked if this sync will be called multiple times in one | |
// kernel (i.e. having a grid reduce within a loop). Having multiple grid syncs | |
// called once in the same kernel does not require persistent mode. Segment size | |
// is the number of blocks participating in the sync in the dimensions marked by | |
// [X,Y,Z]_BLOCK. The granularity of this sync are those dimensions. I.E. | |
// Marking X and Y but not Z means there should be Z semaphores of size X*Y. | |
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, bool PERSISTENT> | |
__device__ void sync(int64_t& semaphore, const uint64_t& segment_size) { | |
// Finish all global memory transactions before synchronizing | |
__threadfence(); | |
// Synchronize all threads in a block before synchronizing blocks | |
block_sync::sync(); | |
// Only allow linear_tid == 0 to participate in the synchronization | |
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) { | |
// Get increment value, only want a single block to have the large | |
// increment, doesn't really matter which one, the goal is to flip/flop the | |
// first bit of a uint64_t value, since our semaphores are actually int64_t | |
// we will just reinterpret_cast it to act as a uint64_t | |
uint64_t semaphore_increment = 1; | |
// Makes the assumption that blocks are in increasing order, this is not | |
// guaranteed by CUDA but this is the current behavior, and unlikely to | |
// change. | |
bool last_block = | |
index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim); | |
if (last_block) { | |
// The last block's increment is chosen so the running sum flips the top
// bit exactly when all segment_size blocks have arrived.
semaphore_increment = FIRST_UINT64_BIT - (segment_size - 1); | |
} | |
uint64_t oldArrive = | |
atomicAdd(reinterpret_cast<uint64_t*>(&semaphore), semaphore_increment); | |
// If for persistent kernels, lock all blocks until the semaphore has been | |
// reached. Make sure we access semaphore as a volatile address so we get | |
// the global memory updates. | |
while ((PERSISTENT || last_block) && | |
((oldArrive ^ globalAsVolatile(semaphore)) & FIRST_UINT64_BIT) == | |
0) { | |
// Put a sleep here so we have some breaks in probing the global | |
// semaphore, giving a better chance for other warps/blocks to catch up. | |
#if __CUDA_ARCH__ >= 700 | |
__nanosleep(200); | |
#else | |
// __nanosleep is not available for sm < 70 | |
assert(false); | |
#endif | |
} | |
} | |
// Sync block to make sure all other threads are waiting on the sync | |
block_sync::sync(); | |
} | |
} // namespace grid_sync | |
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x | |
// dimension of the block. If set to false the dimension doesn't | |
// participate in the reduction. We could start with warp reductions, then | |
// reduce the warps, this could save some shared memory, but could be slower in | |
// some instances. | |
// | |
// EXAMPLE USAGE: | |
// blockReduceSum<X_THREADS, Y_THREADS, Z_THREADS> | |
// (output[output_index], inputs[input_index], | |
// [] __device__ (T& a, const T b) { a += b; }); | |
// | |
// Note: We aggressively template functions taking dim3 in the functions below | |
// because ROCM uses different types for the various dim3 and maps them | |
// directly to intrinsics, but they're dim3 when used after modification. | |
// | |
// Intra-block tree reduction over the dimensions marked X/Y/Z_REDUCE. Each
// reduction segment (formed by the non-reduced dimensions) owns a strip of
// shared_mem; the combined value is folded into `out` only on threads at
// offset 0 of every reduced dimension, and only when write_pred holds.
// Threads failing read_pred contribute init_val instead of inp_val.
template < | |
bool X_REDUCE, | |
bool Y_REDUCE, | |
bool Z_REDUCE, | |
typename T, | |
typename Func, | |
typename _dim3, | |
typename _dim3_2> | |
__device__ void blockReduce( | |
T& out, | |
const T& inp_val, | |
Func reduction_op, | |
const _dim3& thread_idx, | |
const _dim3_2& block_dim, | |
T* shared_mem, | |
bool read_pred, | |
bool write_pred, | |
T init_val) { | |
// If this thread will output a final result | |
bool should_write = | |
index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx); | |
// Size of the reduction segments | |
unsigned int reduction_size = | |
index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim); | |
// Index into the reduction segment | |
unsigned int reduction_tid = | |
index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>( | |
thread_idx, block_dim); | |
// Index of the reduction segment | |
unsigned int reduction_idx = | |
index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>( | |
thread_idx, block_dim); | |
// Offset into smem for the current thread | |
unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid; | |
// Initialize shared memory | |
if (read_pred) { | |
shared_mem[smem_offset] = inp_val; | |
} else { | |
shared_mem[smem_offset] = init_val; | |
} | |
block_sync::sync(); | |
// Reduce down to nearest power of 2 for the tree reduction: | |
int np2 = 1 << (31 - __clz(reduction_size)); | |
if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) { | |
reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + np2]); | |
} | |
block_sync::sync(); | |
// loop peel the final iteration to save one syncthread for the end | |
for (int factor = np2 / 2; factor > 1; factor >>= 1) { | |
if (reduction_tid < factor) { | |
reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + factor]); | |
} | |
block_sync::sync(); | |
} | |
if (should_write && write_pred) { | |
T result = out; | |
reduction_op(result, shared_mem[smem_offset]); | |
// Peeled last tree step: slot 1 still holds the other half whenever more
// than one value participated.
if (reduction_size > 1) { | |
reduction_op(result, shared_mem[smem_offset + 1]); | |
} | |
out = result; | |
} | |
block_sync::sync(); | |
} | |
// Use the same pred for both reads and writes | |
// Convenience overload that forwards read_write_pred as both the read and
// the write predicate of the full blockReduce above.
template < | |
bool X_REDUCE, | |
bool Y_REDUCE, | |
bool Z_REDUCE, | |
typename T, | |
typename Func, | |
typename _dim3, | |
typename _dim3_2> | |
__device__ void blockReduce( | |
T& out, | |
const T& inp_val, | |
Func reduction_op, | |
const _dim3& thread_idx, | |
const _dim3_2& block_dim, | |
T* shared_mem, | |
bool read_write_pred, | |
T init_val) { | |
blockReduce<X_REDUCE, Y_REDUCE, Z_REDUCE, T, Func, _dim3, _dim3_2>( | |
out, | |
inp_val, | |
reduction_op, | |
thread_idx, | |
block_dim, | |
shared_mem, | |
read_write_pred, | |
read_write_pred, | |
init_val); | |
} | |
// Inter-block reduction. | |
// | |
// The gridReduce function performs point-wise reductions of scalars across | |
// thread blocks. Thread blocks are disjointly partitioned into groups, | |
// "reduction segments", that are collectively defined by boolean template | |
// parameters, X_BLOCK, Y_BLOCK and Z_BLOCK. Each of X/Y/Z_BLOCK determines | |
// whether thread blocks along the dimension should be grouped into the same | |
// reduction segment. Cross-block reducitons are independently done within each | |
// segment and generates distinctive results per segment. For instance, if all | |
// of X/Y/Z_BLOCK are true, reductions will be done across all thread blocks | |
// since there will be just a single segment consisting of all thread blocks. If | |
// none of them are true, each thread block will become a segment by itself, so | |
// no reduction will be performed. | |
// | |
// The input scalars to reduce within each segment are a certain subset of | |
// thread-private scalars provided as part of the gridReduce function | |
// parameters. Boolean template parameters, X_THREAD, Y_THREAD and Z_THREAD, | |
// determine which subset of the scalars should be used for inter-block | |
// reductions. Specifically, all the input scalars of threads along each | |
// dimension will be used when X/Y/Z_THREAD are true. Otherwise, only the value | |
// held at offset 0 of each dimension will be used. Thus, for example, if all of | |
// X/Y/Z_THREAD are true, the scalars of all threads in each block will | |
// participate in inter-block reductions. If all of them are false, only one | |
// scalar of the thread at threadIdx.x == threadIdx.y == threadIdx.z == 0 will | |
// be used. In the code below, we call the subset of threads a "reduction | |
// block". "Participating" thread dimensions here are similar to the | |
// "non-participating" block dimensions. They come from a block dimension that | |
// has not been reduced before hitting this grid reduction. | |
// | |
// Inter-block reductions perform point-wise reductions of scalars of reduction | |
// blocks within each reduction segment. More specifically, let rb be a | |
// reduction block and rs be a reduction segment. Let IN(thread_idx, block_idx) | |
// denote the input scalar of thread at thread_idx and block_idx. The result of | |
// each reduction segment, OUT(thread_idx, block_idx_out), is defined only for | |
// each thread_idx in thread block block_idx_out in the segment as follows: | |
// | |
// OUT(thread_idx, block_idx_out) = | |
// Reduction of IN(thread_idx, block_idx) for | |
// all block_idx in a reduction segment | |
// | |
// OUT is not given for all threads that are not in block_idx_out and the | |
// reduction block. | |
// | |
// See also the function comment of gridReduce. | |
namespace reduction { | |
// Reduces all the reduction blocks in each reduction segment. This is the | |
// "cleanup" stage of a grid reduction. | |
// | |
// This is only called by one thread block per reduction segment. The input | |
// reduction blocks of the segment are stored in an intermediate buffer pointed | |
// by parameter in. Template parameters X/Y/Z_THREAD denote how the reduction | |
// block is formed. | |
// | |
// The size of a reduction block is by definition smaller or equal to the size | |
// of a thread block. We use the remaining threads to parallelize reductions | |
// across reduction blocks. For example, when X/Y/Z_THREAD = {true, false, | |
// false}, we use blockDim.y*blockDim.z threads for each output value. This is | |
// done first by loading the input values in parallel and then by reducing | |
// across threads of dimensions whose XYZ_THREAD are false. | |
// | |
// Note that what is done here after the loading from global memory is similar | |
// to what the existing blockReduce function does. | |
// Final ("cleanup") stage of a grid reduction, run by a single block per
// reduction segment; see the long comment block above for the overall
// scheme.
template < | |
bool X_THREAD, | |
bool Y_THREAD, | |
bool Z_THREAD, | |
typename T, | |
typename Func> | |
__device__ void gridReduceLastBlock( | |
T& out, | |
const T* in, | |
const nvfuser_index_t | |
grid_reduction_segment_size, // Number of reductions across | |
// grid reduce dimensions | |
const nvfuser_index_t | |
block_reduction_segment_size, // Number of reductions across the block | |
Func reduction_op, | |
T* shared_buf, | |
bool write_pred, | |
T init_val) { | |
// We have to do num_reductions across reduction_size. The reductions are | |
// contiguous, but offset by reduction_size. There is an entry in "in" for | |
// every block, and every thread marked as true. Threads in dimensions marked | |
// as false can be used to parallelize the reduction. | |
// Find the reduction id of the participating threads | |
const auto block_reduction_segment_idx = | |
index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>( | |
threadIdx, blockDim); | |
// Find an id associated within a reduction segment for all | |
// "non-participating" threads, which will parallelize the reductions for the | |
// "participating" threads | |
const auto id_in_block_segment = | |
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>( | |
threadIdx, blockDim); | |
// Stride by the "non-participating" threads | |
const auto input_stride_for_thread_in_segment = | |
index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim); | |
T inp = init_val; | |
// Block stride across the reduction until we only have one value per thread | |
for (nvfuser_index_t reduction_i = id_in_block_segment; | |
reduction_i < grid_reduction_segment_size; | |
reduction_i += input_stride_for_thread_in_segment) { | |
auto work_buf_offset = reduction_i * block_reduction_segment_size + | |
block_reduction_segment_idx; | |
reduction_op(inp, in[work_buf_offset]); | |
} | |
// Block reduce the per thread values into per "participating" thread values | |
T inp_tmp = init_val; | |
blockReduce<!X_THREAD, !Y_THREAD, !Z_THREAD>( | |
inp_tmp, | |
inp, | |
reduction_op, | |
threadIdx, | |
blockDim, | |
shared_buf, | |
true, | |
init_val); | |
// Only offset-0 threads of the "non-participating" dims merge into `out`.
const bool should_write = (X_THREAD || threadIdx.x == 0) && | |
(Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0); | |
if (should_write && write_pred) { | |
reduction_op(out, inp_tmp); | |
} | |
} | |
// Reduces per-thread values across thread blocks. | |
// | |
// Function parameters: | |
// - out: Per-thread output location | |
// - inp_val: Per-thread input value | |
// - reduction_op: Scalar reduction function | |
// - work_buf: Temporary buffer for cross-block reductions | |
// - sync_flags: A vector of integers for synchronizations | |
// - shared_buf: Shared memory buffer for intra-block reduction | |
// | |
// Thread has valid results based on if it's the last block in the grid | |
// reduction dimension | |
// | |
// Template parameters: | |
// - X/Y/Z_BLOCK: When true, reduces across thread blocks along the X/Y/Z | |
// dimensions | |
// - X/Y/Z_THREAD: When true, all threads along the X/Y/Z dimensions participate | |
// in the cross-block reduction. Otherwise, only threads at offset 0 do. | |
// These are set to true if the dimension in the block has not been reduced | |
// previously in producer tensors, and does not participate in the reduction | |
// (right now they can't), so it's just a "pure" iteration domain as far as | |
// the grid reduce is concerned. | |
// - PERSISTENT_REDUCTION: Indicates grid reduction will be called in a loop, or | |
// the result of the grid reduction will be broadcasted and used across the | |
// grid. These requires cross grid communication and the grid synchronizations | |
// here to actually synchronize across the entire grid. When false the grid is | |
// not synchronized, the last block just waits for everyone else to finish and | |
// the other blocks can exit early. | |
// - T: Scalar data type of input/output data | |
// - Func: Type of scalar reduction function | |
// | |
// Template parameters X/Y/Z_BLOCK define a group of thread blocks that are | |
// reduced together. We call it a reduction segment. Some examples are: | |
// | |
// Case 1: X/Y/Z_BLOCK == true/true/true -> There is only one segment, which | |
// includes all thread blocks. It is effectively the same as the grid. | |
// | |
// Case 2: X/Y/Z_BLOCK == false/false/false -> Each thread block comprises an | |
// individual segment by itself. | |
// | |
// Case 3: X/Y/Z_BLOCK == true/false/false -> Each segment contains thread | |
// blocks that have the same blockDim.x. There will be blockDim.y*blockDim.z | |
// such segments. | |
// | |
// X/Y/Z_THREAD defines a sub region of a thread block that should be reduced | |
// with the sub regions of other thread blocks. We call it a reduction block. | |
// E.g., | |
// | |
// Case 1: X/Y/Z_THREAD == false/false/false -> Only thread 0 participates in | |
// the cross-block reductions. The reduction block is 1x1x1 with thread 0. | |
// | |
// Case 2: X/Y/Z_THREAD == true/true/true-> All threads in a thread block | |
// participate in the cross-block reductions. The reduction block in this case | |
// is equivalent to the thread block. | |
// | |
// After the function completes, only one thread block per reduction segment | |
// gets valid reduction results. There is no guarantee which particular block | |
// gets the final results. | |
// | |
// Cross-block reduction entry point; see the long comment block above for
// the meaning of the template parameters and the segment/reduction-block
// terminology.
template < | |
bool X_BLOCK, | |
bool Y_BLOCK, | |
bool Z_BLOCK, | |
bool X_THREAD, | |
bool Y_THREAD, | |
bool Z_THREAD, | |
bool PERSISTENT_REDUCTION, | |
typename T, | |
typename Func> | |
__device__ void gridReduce( | |
T& out, | |
const T& inp_val, | |
Func reduction_op, | |
volatile T* work_buf, | |
Tensor<int64_t, 1> sync_flags, | |
T* shared_buf, | |
bool read_pred, | |
bool write_pred, | |
T init_val) { | |
// Number of values to reduce in the reduction segment | |
const auto grid_reduction_segment_size = | |
index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim); | |
// Index of the reduction we're performing out of the | |
// grid_reduction_segment_size | |
const auto idx_in_grid_segment = | |
index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>( | |
blockIdx, gridDim); | |
// Number of threads we can use in final reduction, Seems to assume all | |
// threads in the block participate | |
const auto block_reduction_segment_size = | |
index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim); | |
// advance to the offset for this segment | |
// index of reduction * size of the reduction * size of threads | |
work_buf += idx_in_grid_segment * grid_reduction_segment_size * | |
block_reduction_segment_size; | |
// One representative thread per reduction-block entry stages its value
// (or init_val when read_pred fails) into the global work buffer.
if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) && | |
(Z_THREAD || threadIdx.z == 0)) { | |
auto block_offset = | |
index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim); | |
auto thread_offset = | |
index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>( | |
threadIdx, blockDim); | |
auto work_buf_offset = | |
block_offset * block_reduction_segment_size + thread_offset; | |
if (read_pred) { | |
work_buf[work_buf_offset] = inp_val; | |
} else { | |
work_buf[work_buf_offset] = init_val; | |
} | |
} | |
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>( | |
sync_flags[idx_in_grid_segment], grid_reduction_segment_size); | |
bool last_block = | |
index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim); | |
if (last_block) { | |
// Cleanup with block reduction | |
gridReduceLastBlock<X_THREAD, Y_THREAD, Z_THREAD>( | |
out, | |
(T*)work_buf, | |
grid_reduction_segment_size, | |
block_reduction_segment_size, | |
reduction_op, | |
shared_buf, | |
write_pred, | |
init_val); | |
} | |
if (PERSISTENT_REDUCTION) { | |
// Make sure we're done with global memory before we allow the kernel to | |
// continue | |
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>( | |
sync_flags[idx_in_grid_segment], grid_reduction_segment_size); | |
} | |
} | |
} // namespace reduction | |
#undef isize | |
#undef ioffset | |
namespace grid_broadcast {
// Broadcasts per-thread values across threads and blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - work_buf: Temporary buffer for communication across threads/blocks
// - sync_flags: A vector of integers for synchronizations
// - read_write_pred: Predicate guarding both the source's buffer write and
//   every reader's buffer read
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, broadcasts across thread blocks along the X/Y/Z
// dimensions
// - X/Y/Z_THREAD: When true, broadcasts across threads along the X/Y/Z
// dimensions
//
// Fix: removed an unused local (`bool null = false;`) that was dead code in
// the original.
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    typename T>
__device__ void broadcast(
    T& out,
    const T& inp_val,
    volatile T* work_buf,
    Tensor<int64_t, 1> sync_flags,
    bool read_write_pred) {
  // Number of values broadcasted in the grid dimensions
  const auto grid_seg_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the broadcast we're performing out of the grid_seg_size
  const auto grid_seg_idx =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of threads not participating in a broadcast dimension, this is the
  // number of thread entries to expect in the work buffer, therefore a striding
  const auto block_stride =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
  // Which broadcast in the block this is to line up the entry with the work
  // buffer
  const auto thread_offset =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  // The source of a broadcast is the last block along each broadcast grid
  // dimension and thread 0 along each broadcast thread dimension.
  const bool has_valid_data = (!X_BLOCK || blockIdx.x == gridDim.x - 1) &&
      (!Y_BLOCK || blockIdx.y == gridDim.y - 1) &&
      (!Z_BLOCK || blockIdx.z == gridDim.z - 1) &&
      (!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) &&
      (!Z_THREAD || threadIdx.z == 0);
  if (has_valid_data && read_write_pred) {
    work_buf[grid_seg_idx * block_stride + thread_offset] = inp_val;
    // Publish the write to global memory before the grid sync below.
    __threadfence();
  }
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
      sync_flags[grid_seg_idx], grid_seg_size);
  if (read_write_pred) {
    out = work_buf[grid_seg_idx * block_stride + thread_offset];
  }
  // Make sure everyone has read from the buffer before continuing the kernel
  // and potentially overwriting
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
      sync_flags[grid_seg_idx], grid_seg_size);
}
} // namespace grid_broadcast
namespace broadcast { | |
// Broadcasts within partitioned groups of threads. | |
// | |
// X_THREAD: Broadcast from threadIdx.x == 0 if true | |
// Y_THREAD: Broadcast from threadIdx.y == 0 if true | |
// Z_THREAD: Broadcast from threadIdx.z == 0 if true | |
// inp_val: Per-thread source value. Only valid when the thread is a source. | |
// out: Per-thread output location | |
// | |
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T> | |
__device__ void blockBroadcast( | |
T& out, | |
const T& inp_val, | |
T* shared_mem, | |
bool read_write_pred) { | |
const bool has_valid_data = (!X_THREAD || threadIdx.x == 0) && | |
(!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0); | |
const auto shared_offset = | |
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>( | |
threadIdx, blockDim); | |
if (has_valid_data && read_write_pred) { | |
shared_mem[shared_offset] = inp_val; | |
} | |
block_sync::sync(); | |
if (read_write_pred) { | |
out = shared_mem[shared_offset]; | |
} | |
block_sync::sync(); | |
} | |
} // namespace broadcast | |
// ----------------------------------------------------------------------------------------------- | |
// Block Welford Primitives | |
// ----------------------------------------------------------------------------------------------- | |
// Basic utility for welford update. Can be used to scan one value, or to
// merge two welford results.
template <typename T, typename TN> | |
__inline__ __device__ void welfordCombine( | |
T& a_avg, | |
T& a_M2, | |
TN& a_N, | |
const T& b_avg, | |
const T& b_M2, | |
TN b_N) { | |
if (b_N == 0) { | |
return; | |
} | |
TN ab_N = a_N + b_N; | |
T b_N_div_ab_N = ((T)(nvfuser_index_t)(b_N)) / ((T)(nvfuser_index_t)(ab_N)); | |
T delta = b_avg - a_avg; | |
a_avg += delta * b_N_div_ab_N; | |
a_M2 += b_M2 + delta * delta * ((T)(nvfuser_index_t)(a_N)) * b_N_div_ab_N; | |
a_N = ab_N; | |
} | |
// Block-level Welford reduction.
//
// Template parameters:
// - X/Y/Z_REDUCE: when true, threads along that block dimension participate
//   in the reduction; the remaining dimensions index independent reduction
//   segments.
//
// Function parameters:
// - out_avg/out_M2/out_N: the calling thread's running Welford triple; the
//   block result is combined into it only for threads with should_write
// - in_avg/in_M2/in_N: this thread's contribution
// - thread_idx/block_dim: thread index and block shape to reduce over
// - shared_mem_avg/_M2/_N: shared-memory scratch, one slot per thread
// - read_pred: when false this thread contributes the identity
//   (init_val, init_val, 0) instead of its input
// - write_pred: additional predicate guarding the final write
// - init_val: identity value for avg/M2
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename TN,
    typename _dim3,
    typename _dim3_2>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);
  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);
  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          thread_idx, block_dim);
  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          thread_idx, block_dim);
  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;
  // Stage this thread's value (or the Welford identity) into shared memory.
  if (read_pred) {
    shared_mem_avg[smem_offset] = in_avg;
    shared_mem_M2[smem_offset] = in_M2;
    shared_mem_N[smem_offset] = in_N;
  } else {
    shared_mem_avg[smem_offset] = init_val;
    shared_mem_M2[smem_offset] = init_val;
    shared_mem_N[smem_offset] = 0;
  }
  block_sync::sync();
  // Reduce down to nearest power of 2:
  int np2 = 1 << (31 - __clz(reduction_size));
  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    welfordCombine(
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset],
        shared_mem_avg[smem_offset + np2],
        shared_mem_M2[smem_offset + np2],
        shared_mem_N[smem_offset + np2]);
  }
  block_sync::sync();
  // Tree reduction over the power-of-2 extent.
  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      welfordCombine(
          shared_mem_avg[smem_offset],
          shared_mem_M2[smem_offset],
          shared_mem_N[smem_offset],
          shared_mem_avg[smem_offset + factor],
          shared_mem_M2[smem_offset + factor],
          shared_mem_N[smem_offset + factor]);
    }
    block_sync::sync();
  }
  if (should_write && write_pred) {
    T res_avg = out_avg;
    T res_M2 = out_M2;
    TN res_N = out_N;
    welfordCombine(
        res_avg,
        res_M2,
        res_N,
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset]);
    // Peeled final tree iteration: combine slot 0 with slot 1.
    if (reduction_size > 1) {
      welfordCombine(
          res_avg,
          res_M2,
          res_N,
          shared_mem_avg[smem_offset + 1],
          shared_mem_M2[smem_offset + 1],
          shared_mem_N[smem_offset + 1]);
    }
    out_avg = res_avg;
    out_M2 = res_M2;
    out_N = res_N;
  }
  // Keep later code from clobbering the shared-memory scratch while other
  // threads may still be reading it.
  block_sync::sync();
}
// Use the same pred for both reads and writes | |
template < | |
bool X_REDUCE, | |
bool Y_REDUCE, | |
bool Z_REDUCE, | |
typename T, | |
typename TN, | |
typename _dim3, | |
typename _dim3_2> | |
__inline__ __device__ void blockWelford( | |
T& out_avg, | |
T& out_M2, | |
TN& out_N, | |
const T& in_avg, | |
const T& in_M2, | |
const TN& in_N, | |
const _dim3& thread_idx, | |
const _dim3_2& block_dim, | |
T* shared_mem_avg, | |
T* shared_mem_M2, | |
TN* shared_mem_N, | |
bool read_write_pred, | |
T init_val) { | |
blockWelford<X_REDUCE, Y_REDUCE, Z_REDUCE, T, TN, _dim3, _dim3_2>( | |
out_avg, | |
out_M2, | |
out_N, | |
in_avg, | |
in_M2, | |
in_N, | |
thread_idx, | |
block_dim, | |
shared_mem_avg, | |
shared_mem_M2, | |
shared_mem_N, | |
read_write_pred, | |
read_write_pred, | |
init_val); | |
} | |
// ----------------------------------------------------------------------------------------------- | |
// Grid Welford Prototype | |
// ----------------------------------------------------------------------------------------------- | |
namespace welford { | |
// Final step of a grid Welford reduction, run by the last block of each
// segment: serially folds the per-block partial results staged in the global
// work buffer, block-reduces the per-thread partials, and combines the
// result into out_* on the designated writer threads.
//
// Template parameters:
// - X/Y/Z_THREAD: when true, threads along that dimension each hold an
//   independent result ("participating" threads); dimensions marked false
//   are used to parallelize the serial fold over blocks.
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T, typename TN>
__device__ void gridWelfordLastBlock(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T* in_avg,
    const T* in_M2,
    const TN* in_N,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.
  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, blockDim);
  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
  // Per-thread serial accumulator, starting from the Welford identity.
  T inp_avg = init_val;
  T inp_M2 = init_val;
  TN inp_N = 0;
  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    welfordCombine(
        inp_avg,
        inp_M2,
        inp_N,
        in_avg[work_buf_offset],
        in_M2[work_buf_offset],
        in_N[work_buf_offset]);
  }
  // Block reduce the per thread values into per "participating" thread values
  T inp_avg_tmp = init_val;
  T inp_M2_tmp = init_val;
  TN inp_N_tmp = 0;
  blockWelford<!X_THREAD, !Y_THREAD, !Z_THREAD>(
      inp_avg_tmp,
      inp_M2_tmp,
      inp_N_tmp,
      inp_avg,
      inp_M2,
      inp_N,
      threadIdx,
      blockDim,
      shared_buf_avg,
      shared_buf_M2,
      shared_buf_N,
      true,
      init_val);
  // Only thread 0 of each non-participating dimension writes the final
  // combined result.
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    welfordCombine(out_avg, out_M2, out_N, inp_avg_tmp, inp_M2_tmp, inp_N_tmp);
  }
}
// Grid welford combine.
//
// Each block stages its partial Welford triple into the global work buffer,
// all blocks of a reduction segment synchronize through sync_flags, and the
// last block to arrive performs the final combine (gridWelfordLastBlock).
//
// Template parameters:
// - X/Y/Z_BLOCK: when true, blocks along that grid dimension are reduced
// - X/Y/Z_THREAD: when true, threads along that block dimension each keep an
//   independent result
// - PERSISTENT_REDUCTION: when true, a second grid sync is issued at the end
//   so the work buffer can be safely reused
//
// Function parameters:
// - read_pred: when false, this block contributes the identity
//   (init_val, init_val, 0)
// - write_pred: guards the final output write in the last block
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    typename T,
    typename TN>
__device__ void gridWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& inp_avg,
    const T& inp_M2,
    const TN& inp_N,
    volatile T* work_buf_avg,
    volatile T* work_buf_M2,
    volatile TN* work_buf_N,
    Tensor<int64_t, 1> sync_flags,
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of threads we can use in final reduction. Seems to assume all
  // threads in the block participate.
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);
  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf_avg += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  work_buf_M2 += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  work_buf_N += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  // One representative thread per block-reduction entry stages this block's
  // partial result into the global work buffer.
  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, blockDim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    if (read_pred) {
      work_buf_avg[work_buf_offset] = inp_avg;
      work_buf_M2[work_buf_offset] = inp_M2;
      work_buf_N[work_buf_offset] = inp_N;
    } else {
      // Contribute the Welford identity when predicated off.
      work_buf_avg[work_buf_offset] = init_val;
      work_buf_M2[work_buf_offset] = init_val;
      work_buf_N[work_buf_offset] = 0;
    }
  }
  // Wait until every block in this segment has published its partial result.
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
      sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
  if (last_block) {
    // final reduction
    gridWelfordLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
        out_avg,
        out_M2,
        out_N,
        (T*)work_buf_avg,
        (T*)work_buf_M2,
        (TN*)work_buf_N,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        shared_buf_avg,
        shared_buf_M2,
        shared_buf_N,
        write_pred,
        init_val);
  }
  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
        sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  }
}
} // namespace welford | |
#undef isize | |
#undef ioffset | |
namespace warp {
// Reduction along threadIdx.x using warp shuffles, with a shared-memory
// combine across warps when the block has more than one warp in x
// (SINGLE_WARP == false).
//
// - out: per-thread accumulator. In the multi-warp path the reduced value is
//   combined into it only on each warp's lane 0; in the single-warp path
//   every lane holds the full result after the shuffles, so all lanes
//   combine.
// - inp_val: this thread's input, used only when read_write_pred is true;
//   otherwise the thread contributes init_val
// - reduction_op: binary combine, applied as reduction_op(acc, val)
// - shared_mem: scratch, one slot per warp per (z, y) reduction group
template <
    bool SINGLE_WARP,
    typename T,
    typename Func,
    typename _dim3ti,
    typename _dim3bd>
__device__ void warpReduceTIDX(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3ti& thread_idx,
    const _dim3bd& block_dim,
    T* shared_mem,
    bool read_write_pred,
    T init_val) {
  constexpr int WARP_SIZE = 32;
  // Assume input padded to multiples of a warp
  T reduce_val = init_val;
  // Do warp reduction
  if (read_write_pred) {
    reduce_val = inp_val;
  }
  // Reduce within each warp (butterfly over the full warp mask; assumes all
  // 32 lanes are active).
  for (int i = 16; i >= 1; i /= 2) {
    reduction_op(
        reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, WARP_SIZE));
  }
  // Reduce across warp if needed
  // Load value to shared mem
  if (!SINGLE_WARP) {
    unsigned int warp_idx = thread_idx.x / WARP_SIZE;
    unsigned int lane_idx = thread_idx.x % WARP_SIZE;
    // One independent reduction per (z, y) coordinate.
    unsigned int reduce_group_id = thread_idx.z * block_dim.y + thread_idx.y;
    bool is_warp_head = lane_idx == 0;
    unsigned int reduction_size = block_dim.x;
    unsigned int num_of_warps = reduction_size / WARP_SIZE;
    unsigned int smem_offset = reduce_group_id * num_of_warps;
    block_sync::sync();
    // NOTE(review): when read_write_pred is false the warp's slot is left
    // unwritten; presumably the predicate is uniform per warp head — confirm.
    if (read_write_pred && is_warp_head) {
      shared_mem[smem_offset + warp_idx] = reduce_val;
    }
    block_sync::sync();
    if (warp_idx == 0) {
      // This assumes num_of_warps will be < 32, meaning < 1024 blocks.
      // Should be true for long enough.
      assert(num_of_warps <= 32);
      reduce_val = lane_idx < num_of_warps ? shared_mem[smem_offset + lane_idx]
                                           : init_val;
      // Reduce within warp 0
      for (int i = 16; i >= 1; i /= 2) {
        reduction_op(
            reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, 32));
      }
    }
    if (is_warp_head) {
      reduction_op(out, reduce_val);
    }
  } else {
    reduction_op(out, reduce_val);
  }
}
} // namespace warp
// No "#pragma once" because this is a raw definition that can be copied by jit codegen. | |
// Eager mode clients should not include this file directly, instead, | |
// they should #include <ATen/cuda/CUDAGeneratorImpl.h>, which has a #pragma once. | |
// Stores RNG state values. Passed as a kernel argument. | |
// See Note [CUDA Graph-safe RNG states]. | |
// | |
// The raw definition lives in its own file so jit codegen can easily copy it. | |
namespace at {
// Philox RNG state passed to CUDA kernels as a kernel argument.
// Two construction modes (see Note [CUDA Graph-safe RNG states]):
// - eager: offset_ carries a concrete value (offset_.val)
// - graph capture: offset_ carries a device pointer (offset_.ptr) plus an
//   intragraph offset, and captured_ is set.
struct PhiloxCudaState {
  PhiloxCudaState() = default;
  // Called if graph capture is not underway
  PhiloxCudaState(uint64_t seed,
                  uint64_t offset) {
    seed_ = seed;
    offset_.val = offset;
  }
  // Called if graph capture is underway
  PhiloxCudaState(uint64_t seed,
                  int64_t* offset_extragraph,
                  uint32_t offset_intragraph) {
    seed_ = seed;
    offset_.ptr = offset_extragraph;
    offset_intragraph_ = offset_intragraph;
    captured_ = true;
  }
  // Public members, directly accessible by at::cuda::philox::unpack.
  // If we made them private with getters/setters, the getters/setters
  // would have to be __device__, and we can't declare __device__ in ATen.
  union Payload {
    uint64_t val; // eager mode: the offset value itself
    int64_t* ptr; // graph capture: pointer to the offset in device memory
  };
  uint64_t seed_ = 0;
  Payload offset_;
  uint32_t offset_intragraph_ = 0; // extra offset applied within a graph
  bool captured_ = false; // true iff constructed for graph capture
};
} // namespace at
// Codegen-dumped fusion kernel from the failing test below. Both where()
// calls receive bool operands; per the NVRTC error that follows in this log,
// where() only has double/float/int64_t overloads, so this program fails to
// compile.
__global__ void kernel127(Tensor<bool, 0> T0, Tensor<bool, 0> T1, Tensor<bool, 0> T2, Tensor<int64_t, 0> T3, Tensor<int64_t, 0> T4, Tensor<bool, 0> T6, Tensor<int64_t, 0> T5) {
  T6[0]
      = where(T0[0], T1[0], T2[0]);
  T5[0]
      = where(T0[0], T3[0], T4[0]);
}
} | |
CUDA NVRTC compile error: default_program(1670): error: more than one instance of overloaded function "CudaCodeGen::where" matches the argument list: | |
function "CudaCodeGen::where(__nv_bool, double, double)" | |
function "CudaCodeGen::where(__nv_bool, float, float)" | |
function "CudaCodeGen::where(__nv_bool, CudaCodeGen::int64_t, CudaCodeGen::int64_t)" | |
argument types are: (__nv_bool, __nv_bool, __nv_bool) | |
1 error detected in the compilation of "default_program". | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_mean_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: sorted_exprs.size() > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp":1403, please report a bug to PyTorch. Error during expression sorting, no expressions produced. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_mean_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Expected kernel_ to be true, but got false. (Could this error message be improved? If so, please report an enhancement request to PyTorch.) | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_mean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_mean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_mean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_mean_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Expected kernel_ to be true, but got false. (Could this error message be improved? If so, please report an enhancement request to PyTorch.) | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_mean_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: falseINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/executor_utils.cpp":828, please report a bug to PyTorch. namespace CudaCodeGen { | |
typedef unsigned char uint8_t; | |
typedef signed char int8_t; | |
typedef short int int16_t; | |
typedef int int32_t; | |
typedef unsigned int uint32_t; | |
typedef long long int int64_t; | |
typedef unsigned long long int uint64_t; | |
typedef int nvfuser_index_t; | |
#define __NVFUSER_HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var))) | |
#define __NVFUSER_HALF_TO_CUS(var) \ | |
*(reinterpret_cast<const unsigned short*>(&(var))) | |
struct __half; | |
__device__ __half __float2half(const float); | |
struct __align__(2) __half { | |
__half() = default; | |
__device__ __half(const float f) { | |
__x = __float2half(f).__x; | |
} | |
protected: | |
unsigned short __x; | |
}; | |
__device__ __half __float2half(const float f) { | |
__half val; | |
asm("{ cvt.rn.f16.f32 %0, %1;}\n" | |
: "=h"(__NVFUSER_HALF_TO_US(val)) | |
: "f"(f)); | |
return val; | |
} | |
__device__ float __half2float(const __half h) { | |
float val; | |
asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__NVFUSER_HALF_TO_CUS(h))); | |
return val; | |
} | |
// aligned vector generates vectorized load/store on CUDA | |
template <typename scalar_t, int vec_size> | |
struct alignas(sizeof(scalar_t) * vec_size) Array { | |
scalar_t val[vec_size]; | |
__device__ void set(scalar_t v) { | |
for (int i = 0; i < vec_size; ++i) { | |
val[i] = v; | |
} | |
} | |
}; | |
#define __NVFUSER_BFLOAT_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var))) | |
#define __NVFUSER_BFLOAT_TO_CUS(var) \ | |
*(reinterpret_cast<const unsigned short*>(&(var))) | |
struct __bfloat; | |
__device__ __bfloat __float2bfloat(const float); | |
struct __align__(2) __bfloat { | |
__bfloat() = default; | |
__device__ __bfloat(const float f) { | |
__x = __float2bfloat(f).__x; | |
} | |
protected: | |
unsigned short __x; | |
}; | |
__device__ __bfloat __float2bfloat(const float f) { | |
__bfloat val; | |
asm("{ cvt.rn.bf16.f32 %0, %1;}\n" | |
: "=h"(__NVFUSER_BFLOAT_TO_US(val)) | |
: "f"(f)); | |
return val; | |
} | |
__device__ float __bfloat2float(const __bfloat h) { | |
float val; | |
asm("{ mov.b32 %0, {0,%1};}\n" | |
: "=f"(val) | |
: "h"(__NVFUSER_BFLOAT_TO_CUS(h))); | |
return val; | |
} | |
template <typename T, int N> | |
struct Tensor { | |
__device__ T& operator[](nvfuser_index_t ind) { | |
return data[ind]; | |
}; | |
T* data; | |
nvfuser_index_t size[N]; | |
nvfuser_index_t stride[N]; | |
}; | |
// Specialization for 0-dim case as it does not need size and stride arrays. | |
// They will be an error as well since zero-length arrays are not allowed. | |
template <typename T> | |
struct Tensor<T, 0> { | |
__device__ T& operator[](nvfuser_index_t) { | |
return *data; | |
}; | |
T* data; | |
}; | |
class Philox { | |
public: | |
__device__ Philox( | |
unsigned long long seed, | |
unsigned long long subsequence, | |
unsigned long long offset) { | |
key.x = (unsigned int)seed; | |
key.y = (unsigned int)(seed >> 32); | |
counter = make_uint4(0, 0, 0, 0); | |
counter.z = (unsigned int)(subsequence); | |
counter.w = (unsigned int)(subsequence >> 32); | |
STATE = 0; | |
incr_n(offset / 4); | |
} | |
__device__ unsigned long operator()() { | |
if (STATE == 0) { | |
uint4 counter_ = counter; | |
uint2 key_ = key; | |
for (int i = 0; i < 9; i++) { | |
counter_ = single_round(counter_, key_); | |
key_.x += (kPhilox10A); | |
key_.y += (kPhilox10B); | |
} | |
output = single_round(counter_, key_); | |
incr(); | |
} | |
unsigned long ret = 0; | |
switch (STATE) { | |
case 0: | |
ret = output.x; | |
break; | |
case 1: | |
ret = output.y; | |
break; | |
case 2: | |
ret = output.z; | |
break; | |
case 3: | |
ret = output.w; | |
break; | |
} | |
STATE = (STATE + 1) % 4; | |
return ret; | |
} | |
private: | |
__device__ void incr_n(unsigned long long n) { | |
unsigned int nlo = (unsigned int)(n); | |
unsigned int nhi = (unsigned int)(n >> 32); | |
counter.x += nlo; | |
if (counter.x < nlo) | |
nhi++; | |
counter.y += nhi; | |
if (nhi <= counter.y) | |
return; | |
if (++counter.z) | |
return; | |
++counter.w; | |
} | |
__device__ void incr() { | |
if (++counter.x) | |
return; | |
if (++counter.y) | |
return; | |
if (++counter.z) | |
return; | |
++counter.w; | |
} | |
__device__ unsigned int mulhilo32( | |
unsigned int a, | |
unsigned int b, | |
unsigned int* result_high) { | |
*result_high = __umulhi(a, b); | |
return a * b; | |
} | |
__device__ uint4 single_round(uint4 ctr, uint2 key) { | |
unsigned int hi0; | |
unsigned int hi1; | |
unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0); | |
unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1); | |
uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0}; | |
return ret; | |
} | |
private: | |
static constexpr unsigned long kPhilox10A = 0x9E3779B9; | |
static constexpr unsigned long kPhilox10B = 0xBB67AE85; | |
static constexpr unsigned long kPhiloxSA = 0xD2511F53; | |
static constexpr unsigned long kPhiloxSB = 0xCD9E8D57; | |
uint4 counter = {}; | |
uint4 output = {}; | |
uint2 key = {}; | |
unsigned int STATE = 0; | |
}; | |
__device__ float uniformf(unsigned int x) { | |
constexpr float kRanInvM32 = 2.3283064e-10f; // Inverse of 2^32. | |
return x * kRanInvM32; | |
} | |
__device__ double uniform(unsigned int x, unsigned int y) { | |
constexpr double kRan2Pow53Inv = 1.1102230246251565e-16; | |
const unsigned long long z = | |
(unsigned long long)x ^ ((unsigned long long)y << (53 - 32)); | |
return z * kRan2Pow53Inv + (kRan2Pow53Inv / 2.0); | |
} | |
#define NVFUSER_DEFINE_MAGIC_ZERO \ | |
__shared__ int nvfuser_zero_s; \ | |
if (threadIdx.x == 0) \ | |
nvfuser_zero_s = 0; \ | |
__syncthreads(); \ | |
atomicMin(&nvfuser_zero_s, threadIdx.x); \ | |
int nvfuser_zero = nvfuser_zero_s; | |
#define NVFUSER_UPDATE_MAGIC_ZERO \ | |
do { \ | |
nvfuser_zero <<= 1; \ | |
} while (0); | |
__device__ constexpr int ceilDiv(int a, int b) { | |
return (a + b - 1) / b; | |
} | |
__device__ constexpr int64_t ceilDiv(int64_t a, int64_t b) { | |
return (a + b - 1) / b; | |
} | |
__device__ constexpr int64_t ceilDiv(int64_t a, int b) { | |
return ceilDiv(a, (int64_t)b); | |
} | |
__device__ constexpr int64_t ceilDiv(int a, int64_t b) { | |
return ceilDiv((int64_t)a, b); | |
} | |
__device__ constexpr int max(int a, int b) { | |
return ::max(a, b); | |
} | |
__device__ constexpr int64_t max(int64_t a, int b) { | |
return ::max(a, (int64_t)b); | |
} | |
__device__ constexpr int64_t max(int a, int64_t b) { | |
return ::max((int64_t)a, b); | |
} | |
__device__ constexpr int64_t max(int64_t a, int64_t b) { | |
return ::max(a, b); | |
} | |
__device__ double fmax(double a, double b) { | |
// check and propagate NaN | |
if (a != a) { | |
return a; | |
} else if (b != b) { | |
return b; | |
} else { | |
return ::fmax(a, b); | |
} | |
} | |
__device__ float fmax(float a, float b) { | |
// check and propagate NaN | |
if (a != a) { | |
return a; | |
} else if (b != b) { | |
return b; | |
} else { | |
return ::fmax(a, b); | |
} | |
} | |
__device__ constexpr int min(int a, int b) { | |
return ::min(a, b); | |
} | |
__device__ constexpr int64_t min(int64_t a, int b) { | |
return ::min(a, (int64_t)b); | |
} | |
__device__ constexpr int64_t min(int a, int64_t b) { | |
return ::min((int64_t)a, b); | |
} | |
__device__ constexpr int64_t min(int64_t a, int64_t b) { | |
return ::min(a, b); | |
} | |
__device__ double fmin(double a, double b) { | |
// check and propagate NaN | |
if (a != a) { | |
return a; | |
} else if (b != b) { | |
return b; | |
} else { | |
return ::fmin(a, b); | |
} | |
} | |
__device__ float fmin(float a, float b) { | |
// check and propagate NaN | |
if (a != a) { | |
return a; | |
} else if (b != b) { | |
return b; | |
} else { | |
return ::fmin(a, b); | |
} | |
} | |
__device__ constexpr int alignBufferSize(int buffer, int size) { | |
return (buffer + (size - 1)) & ~(size - 1); | |
} | |
__device__ double clamp(double x, double minv, double maxv) { | |
return x < minv ? minv : (x > maxv ? maxv : x); | |
} | |
__device__ float clamp(float x, double minv, double maxv) { | |
return x < minv ? minv : (x > maxv ? maxv : x); | |
} | |
__device__ double frac(double x) { | |
return x - trunc(x); | |
} | |
__device__ float frac(float x) { | |
return x - trunc(x); | |
} | |
__device__ double gelu(double x) { | |
return x * normcdf(x); | |
} | |
__device__ float gelu(float x) { | |
return x * normcdf(x); | |
} | |
__device__ double reciprocal(double x) { | |
return 1 / x; | |
} | |
__device__ float reciprocal(float x) { | |
return 1 / x; | |
} | |
__device__ double relu(double x) { | |
return x <= 0 ? 0 : x; | |
} | |
__device__ float relu(float x) { | |
return x <= 0 ? 0 : x; | |
} | |
__device__ float relu(int64_t x) { | |
return x <= 0 ? 0 : x; | |
} | |
__device__ float relu(int x) { | |
return x <= 0 ? 0 : x; | |
} | |
__device__ double remainder(double a, double b) { | |
auto mod = ::fmod(a, b); | |
if ((mod != 0) && ((b < 0) != (mod < 0))) | |
mod += b; | |
return mod; | |
} | |
__device__ float remainder(float a, float b) { | |
auto mod = ::fmod(a, b); | |
if ((mod != 0) && ((b < 0) != (mod < 0))) | |
mod += b; | |
return mod; | |
} | |
__device__ double sigmoid(double x) { | |
return 1 / (1 + exp(-x)); | |
} | |
__device__ float sigmoid(float x) { | |
return 1 / (1 + exp(-x)); | |
} | |
__device__ double silu(double x) { | |
return x * sigmoid(x); | |
} | |
__device__ float silu(float x) { | |
return x * sigmoid(x); | |
} | |
__device__ double threshold(double x, double t, double v) { | |
return x <= t ? v : x; | |
} | |
__device__ float threshold(float x, double t, double v) { | |
return x <= t ? v : x; | |
} | |
__device__ double where(bool c, double a, double b) { | |
return c ? a : b; | |
} | |
__device__ float where(bool c, float a, float b) { | |
return c ? a : b; | |
} | |
__device__ int64_t where(bool c, int64_t a, int64_t b) { | |
return c ? a : b; | |
} | |
__device__ double randLike(Philox& rnd) { | |
return uniform(rnd(), rnd()); | |
} | |
__device__ float randLikef(Philox& rnd) { | |
return uniformf(rnd()); | |
} | |
__device__ constexpr int64_t remainder(int64_t a, int64_t b) { | |
auto mod = a % b; | |
if ((mod != 0) && ((b < 0) != (mod < 0))) | |
mod += b; | |
return mod; | |
} | |
__device__ constexpr int remainder(int a, int b) { | |
auto mod = a % b; | |
if ((mod != 0) && ((b < 0) != (mod < 0))) | |
mod += b; | |
return mod; | |
} | |
__device__ constexpr int64_t fmod(int64_t a, int64_t b) { | |
return a % b; | |
} | |
__device__ constexpr int fmod(int a, int b) { | |
return a % b; | |
} | |
__device__ constexpr double fmod(double a, double b) { | |
return ::fmod(a, b); | |
} | |
__device__ constexpr float fmod(float a, float b) { | |
return ::fmod(a, b); | |
} | |
template <typename T> | |
__device__ T pow(T a, T b) { | |
if (b < 0) { | |
if (a == 1) { | |
return 1; | |
} else if (a == -1) { | |
auto negative = (-b) % static_cast<T>(2); | |
return negative ? -1 : 1; | |
} else { | |
return 0; | |
} | |
} else { | |
T result = 1; | |
while (b) { | |
if (b & 1) { | |
result *= a; | |
} | |
b /= 2; | |
a *= a; | |
} | |
return result; | |
} | |
} | |
template int pow<int>(int a, int b); | |
template int64_t pow<int64_t>(int64_t a, int64_t b); | |
template <> | |
float pow<float>(float a, float b) { | |
return ::pow(a, b); | |
} | |
template <> | |
double pow<double>(double a, double b) { | |
return ::pow(a, b); | |
} | |
namespace index_utils { | |
// Utility functions | |
// Total size of provided dimension | |
template <typename _dim3> | |
__device__ __forceinline__ nvfuser_index_t size(const _dim3& d) { | |
return (nvfuser_index_t)d.x * (nvfuser_index_t)d.y * (nvfuser_index_t)d.z; | |
} | |
// Linearized indexing of idx based on dim, if bool==false that dimension does | |
// not participate | |
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2> | |
__device__ nvfuser_index_t maskedOffset(const _dim3& idx, const _dim3_2& dim) { | |
nvfuser_index_t offset = 0; | |
if (Z) | |
offset += idx.z; | |
if (Y) | |
offset = offset * dim.y + idx.y; | |
if (X) | |
offset = offset * dim.x + idx.x; | |
return offset; | |
} | |
// Linearized indexing of idx based on dim. All dimensions participate. | |
template <typename _dim3, typename _dim3_2> | |
__device__ nvfuser_index_t offset(const _dim3& idx, const _dim3_2& dim) { | |
nvfuser_index_t offset = idx.z; | |
offset = offset * dim.y + idx.y; | |
offset = offset * dim.x + idx.x; | |
return offset; | |
} | |
// Masks the provided dim3, those == false get truncated to 1 | |
template <bool X, bool Y, bool Z, typename _dim3> | |
__device__ dim3 maskedDims(const _dim3& dim) { | |
return dim3{ | |
X ? (unsigned)dim.x : 1U, | |
Y ? (unsigned)dim.y : 1U, | |
Z ? (unsigned)dim.z : 1U}; | |
} | |
// Provides total size of dim with masking, those dims == false do not | |
// participate in the size calculation | |
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3> | |
__device__ nvfuser_index_t maskedSize(const _dim3& dim) { | |
return size(maskedDims<X_BLOCK, Y_BLOCK, Z_BLOCK>(dim)); | |
} | |
// Checks if provided idx is zero on those dims == true | |
template <bool X, bool Y, bool Z, typename _dim3> | |
__device__ bool maskedIsZero(const _dim3& idx) { | |
bool isZero = true; | |
if (X) | |
isZero = isZero && idx.x == 0; | |
if (Y) | |
isZero = isZero && idx.y == 0; | |
if (Z) | |
isZero = isZero && idx.z == 0; | |
return isZero; | |
} | |
// Checks if provided idx is zero on those dims == true | |
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2> | |
__device__ bool maskedIsLast(const _dim3& idx, const _dim3_2& dim) { | |
bool isZero = true; | |
if (X) | |
isZero = isZero && idx.x == dim.x - 1; | |
if (Y) | |
isZero = isZero && idx.y == dim.y - 1; | |
if (Z) | |
isZero = isZero && idx.z == dim.z - 1; | |
return isZero; | |
} | |
} // namespace index_utils | |
// Default block synchronization. Just use __barrier_sync | |
namespace block_sync { | |
__forceinline__ __device__ void init() {} | |
// Thread-block synchronization | |
__forceinline__ __device__ void sync() { | |
__barrier_sync(0); | |
} | |
} // namespace block_sync | |
namespace grid_sync { | |
// Get the first bit in a 64 bit integer | |
#define FIRST_UINT64_BIT ((uint64_t)1 << (sizeof(uint64_t) * 8 - 1)) | |
template <typename T> | |
__device__ T globalAsVolatile(volatile T& global_val) { | |
return global_val; | |
} | |
// A grid synchronization that can be called multiple times in a kernel assuming | |
// all the blocks fit on device at once. The semaphore is an integer semaphore | |
// assumed to be initialized to 0 before launching the kernel. The persistent | |
// option should be envoked if this sync will be called multiple times in one | |
// kernel (i.e. having a grid reduce within a loop). Having multiple grid syncs | |
// called once in the same kernel does not require persistent mode. Segment size | |
// is the number of blocks participating in the sync in the dimensions marked by | |
// [X,Y,Z]_BLOCK. The granularity of this sync are those dimensions. I.E. | |
// Marking X and Y but not Z means there should be Z semaphores of size X*Y. | |
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, bool PERSISTENT> | |
__device__ void sync(int64_t& semaphore, const uint64_t& segment_size) { | |
// Finish all global memory transactions before synchronizing | |
__threadfence(); | |
// Synchronize all threads in a block before synchronizing blocks | |
block_sync::sync(); | |
// Only allow linear_tid == 0 to participate in the synchronization | |
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) { | |
// Get increment value, only want a single block to have the large | |
// increment, doesn't really matter which one, the goal is to flip/flop the | |
// first bit of a uint64_t value, since our semaphores are actualy int64_t | |
// we will just reinterpret_cast it to act as a uint64_t | |
uint64_t semaphore_increment = 1; | |
// Makes the assumption that blocks are in increasing order, this is not | |
// guaranteed by CUDA but this is the current behavior, and unlikely to | |
// change. | |
bool last_block = | |
index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim); | |
if (last_block) { | |
semaphore_increment = FIRST_UINT64_BIT - (segment_size - 1); | |
} | |
uint64_t oldArrive = | |
atomicAdd(reinterpret_cast<uint64_t*>(&semaphore), semaphore_increment); | |
// If for persistent kernels, lock all blocks until the semaphore has been | |
// reached. Make sure we access semaphore as a volatile address so we get | |
// the global memory updates. | |
while ((PERSISTENT || last_block) && | |
((oldArrive ^ globalAsVolatile(semaphore)) & FIRST_UINT64_BIT) == | |
0) { | |
// Put a sleep here so we have some breaks in probing the global | |
// semaphore, giving a better chance for other warps/blocks to catch up. | |
#if __CUDA_ARCH__ >= 700 | |
__nanosleep(200); | |
#else | |
// __nanosleep is not available for sm < 70 | |
assert(false); | |
#endif | |
} | |
} | |
// Sync block to make sure all other threads are waiting on the sync | |
block_sync::sync(); | |
} | |
} // namespace grid_sync | |
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x | |
// dimension of the block. If set to false the dimension doesn't | |
// participate in the reduction. We could start with warp reductions, then | |
// reduce the warps, this could save some shared memory, but could be slower in | |
// some instances. | |
// | |
// EXAMPLE USAGE: | |
// blockReduceSum<X_THREADS, Y_THREADS, Z_THREADS> | |
// (output[output_index], inputs[input_index], | |
// [] __device__ (T& a, const T b) { a += b; }); | |
// | |
// Note: We agressively template functions taking dim3 in the functions below | |
// because ROCM uses different types for the various dim3 and maps them | |
// directly to intrinsics, but they're dim3 when used after modification. | |
// | |
template < | |
bool X_REDUCE, | |
bool Y_REDUCE, | |
bool Z_REDUCE, | |
typename T, | |
typename Func, | |
typename _dim3, | |
typename _dim3_2> | |
__device__ void blockReduce( | |
T& out, | |
const T& inp_val, | |
Func reduction_op, | |
const _dim3& thread_idx, | |
const _dim3_2& block_dim, | |
T* shared_mem, | |
bool read_pred, | |
bool write_pred, | |
T init_val) { | |
// If this thread will output a final result | |
bool should_write = | |
index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx); | |
// Size of the reduction segments | |
unsigned int reduction_size = | |
index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim); | |
// Index into the reduction segment | |
unsigned int reduction_tid = | |
index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>( | |
thread_idx, block_dim); | |
// Index of the reduction segment | |
unsigned int reduction_idx = | |
index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>( | |
thread_idx, block_dim); | |
// Offset into smem for the current thread | |
unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid; | |
// Initialize shared memory | |
if (read_pred) { | |
shared_mem[smem_offset] = inp_val; | |
} else { | |
shared_mem[smem_offset] = init_val; | |
} | |
block_sync::sync(); | |
// Reduce down to nearest power of 2 for the tree reduction: | |
int np2 = 1 << (31 - __clz(reduction_size)); | |
if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) { | |
reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + np2]); | |
} | |
block_sync::sync(); | |
// loop peel the final iteration to save one syncthread for the end | |
for (int factor = np2 / 2; factor > 1; factor >>= 1) { | |
if (reduction_tid < factor) { | |
reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + factor]); | |
} | |
block_sync::sync(); | |
} | |
if (should_write && write_pred) { | |
T result = out; | |
reduction_op(result, shared_mem[smem_offset]); | |
if (reduction_size > 1) { | |
reduction_op(result, shared_mem[smem_offset + 1]); | |
} | |
out = result; | |
} | |
block_sync::sync(); | |
} | |
// Use the same pred for both reads and writes | |
template < | |
bool X_REDUCE, | |
bool Y_REDUCE, | |
bool Z_REDUCE, | |
typename T, | |
typename Func, | |
typename _dim3, | |
typename _dim3_2> | |
__device__ void blockReduce( | |
T& out, | |
const T& inp_val, | |
Func reduction_op, | |
const _dim3& thread_idx, | |
const _dim3_2& block_dim, | |
T* shared_mem, | |
bool read_write_pred, | |
T init_val) { | |
blockReduce<X_REDUCE, Y_REDUCE, Z_REDUCE, T, Func, _dim3, _dim3_2>( | |
out, | |
inp_val, | |
reduction_op, | |
thread_idx, | |
block_dim, | |
shared_mem, | |
read_write_pred, | |
read_write_pred, | |
init_val); | |
} | |
// Inter-block reduction. | |
// | |
// The gridReduce function performs point-wise reductions of scalars across | |
// thread blocks. Thread blocks are disjointly partitioned into groups, | |
// "reduction segments", that are collectively defined by boolean template | |
// parameters, X_BLOCK, Y_BLOCK and Z_BLOCK. Each of X/Y/Z_BLOCK determines | |
// whether thread blocks along the dimension should be grouped into the same | |
// reduction segment. Cross-block reducitons are independently done within each | |
// segment and generates distinctive results per segment. For instance, if all | |
// of X/Y/Z_BLOCK are true, reductions will be done across all thread blocks | |
// since there will be just a single segment consisting of all thread blocks. If | |
// none of them are true, each thread block will become a segment by itself, so | |
// no reduction will be performed. | |
// | |
// The input scalars to reduce within each segment are a certain subset of | |
// thread-private scalars provided as part of the gridReduce function | |
// parameters. Boolean template parameters, X_THREAD, Y_THREAD and Z_THREAD, | |
// determine which subset of the scalars should be used for inter-block | |
// reductions. Specifically, all the input scalars of threads along each | |
// dimension will be used when X/Y/Z_THREAD are true. Otherwise, only the value | |
// held at offset 0 of each dimension will be used. Thus, for example, if all of | |
// X/Y/Z_THREAD are true, the scalars of all threads in each block will | |
// participate in inter-block reductions. If all of them are false, only one | |
// scalar of the thread at threadIdx.x == threadIdx.y == threadIdx.z == 0 will | |
// be used. In the code below, we call the subset of threads a "reduction | |
// block". "Participating" thread dimensions here are similar to the | |
// "non-participating" block dimensions. They come from a block dimension that | |
// has not been reduced before hitting this grid reduction. | |
// | |
// Inter-block reductions perform point-wise reductions of scalars of reduction | |
// blocks within each reduction segment. More specifically, let rb be a | |
// reduction block and rs be a reduction segment. Let IN(thread_idx, block_idx) | |
// denote the input scalar of thread at thread_idx and block_idx. The result of | |
// each reduction segment, OUT(thread_idx, block_idx_out), is defined only for | |
// each thread_idx in thread block block_idx_out in the segment as follows: | |
// | |
// OUT(thread_idx, block_idx_out) = | |
// Reduction of IN(thread_idx, block_idx) for | |
// all block_idx in a reduction segment | |
// | |
// OUT is not given for all threads that are not in block_idx_out and the | |
// reduction block. | |
// | |
// See also the function comment of gridReduce. | |
namespace reduction { | |
// Reduces all the reduction blocks in each reduction segment. This is the | |
// "cleanup" stage of a grid reduction. | |
// | |
// This is only called by one thread block per reduction segment. The input | |
// reduction blocks of the segment are stored in an intermediate buffer pointed | |
// by parameter in. Template parameters X/Y/Z_THREAD denote how the reduction | |
// block is formed. | |
// | |
// The size of a reduction block is by definition smaller or equal to the size | |
// of a thread block. We use the remaining threads to parallelize reductions | |
// across reduction blocks. For example, when X/Y/Z_THREAD = {true, false, | |
// false}, we use blockDim.y*blockDim.z threads for each output value. This is | |
// done first by loading the input values in parallel and then by reducing | |
// across threads of dimensions whose XYZ_THREAD are false. | |
// | |
// Note that what is done here after the loading from global memory is similar | |
// to what the existing blockReduce function does. | |
template < | |
bool X_THREAD, | |
bool Y_THREAD, | |
bool Z_THREAD, | |
typename T, | |
typename Func> | |
__device__ void gridReduceLastBlock( | |
T& out, | |
const T* in, | |
const nvfuser_index_t | |
grid_reduction_segment_size, // Number of reductions across | |
// grid reduce dimensions | |
const nvfuser_index_t | |
block_reduction_segment_size, // Number of reductions across the block | |
Func reduction_op, | |
T* shared_buf, | |
bool write_pred, | |
T init_val) { | |
// We have to do num_reductions across reduction_size. The reductions are | |
// contiguous, but offset by reduction_size. There is an entry in "in" for | |
// every block, and every thread marked as true. Threads in dimensions marked | |
// as false can be used to parallelize the reduction. | |
// Find the reduction id of the participating threads | |
const auto block_reduction_segment_idx = | |
index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>( | |
threadIdx, blockDim); | |
// Find an id associated within a reduction segment for all | |
// "non-participating" threads, which will parallelize the reductions for the | |
// "participating" threads | |
const auto id_in_block_segment = | |
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>( | |
threadIdx, blockDim); | |
// Stride by the "non-participating" threads | |
const auto input_stride_for_thread_in_segment = | |
index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim); | |
T inp = init_val; | |
// Block stride across the reduction until we only have one value per thread | |
for (nvfuser_index_t reduction_i = id_in_block_segment; | |
reduction_i < grid_reduction_segment_size; | |
reduction_i += input_stride_for_thread_in_segment) { | |
auto work_buf_offset = reduction_i * block_reduction_segment_size + | |
block_reduction_segment_idx; | |
reduction_op(inp, in[work_buf_offset]); | |
} | |
// Block reduce the per thread values into per "participating" thread values | |
T inp_tmp = init_val; | |
blockReduce<!X_THREAD, !Y_THREAD, !Z_THREAD>( | |
inp_tmp, | |
inp, | |
reduction_op, | |
threadIdx, | |
blockDim, | |
shared_buf, | |
true, | |
init_val); | |
const bool should_write = (X_THREAD || threadIdx.x == 0) && | |
(Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0); | |
if (should_write && write_pred) { | |
reduction_op(out, inp_tmp); | |
} | |
} | |
// Reduces per-thread values across thread blocks. | |
// | |
// Function parameters: | |
// - out: Per-thread output location | |
// - inp_val: Per-thread input value | |
// - reduction_op: Scalar reduction function | |
// - work_buf: Temporary buffer for cross-block reductions | |
// - sync_flags: A vector of integers for synchronizations | |
// - shared_buf: Shared memory buffer for intra-block reduction | |
// | |
// Thread has valid results based on if it's the last block in the grid | |
// reduction dimension | |
// | |
// Template parameters: | |
// - X/Y/Z_BLOCK: When true, reduces across thread blocks along the X/Y/Z | |
// dimensions | |
// - X/Y/Z_THREAD: When true, all threads along the X/Y/Z dimensions participate | |
// in the cross-block reduction. Otherwise, only threads at offset 0 do. | |
// These are set to true if the dimension in the block has not been reduced | |
// previously in producer tensors, and does not participate in the reduction | |
// (right now they can't), so it's just a "pure" iteration domain as far as | |
// the grid reduce is concerned. | |
// - PERSISTENT_REDUCTION: Indicates grid reduction will be called in a loop, or | |
// the result of the grid reduction will be broadcasted and used across the | |
// grid. These requires cross grid communication and the grid synchronizations | |
// here to actually synchronize across the entire grid. When false the grid is | |
// not synchronized, the last block just waits for everyone else to finish and | |
// the other blocks can exit early. | |
// - T: Scalar data type of input/output data | |
// - Func: Type of scalara reduction function | |
// | |
// Template parameters X/Y/Z_BLOCK define a group of thread blocks that are | |
// reduced together. We call it a reduction segment. Some examples are: | |
// | |
// Case 1: X/Y/Z_BLOCK == true/true/true -> There is only one segment, which | |
// includes all thread blocks. It is effecively the same as the grid. | |
// | |
// Case 2: X/Y/Z_BLOCK == false/false/false -> Each thread block comprises an | |
// individual segment by itself. | |
// | |
// Case 3: X/Y/Z_BLOCK == true/false/false -> Each segment contains thread | |
// blocks that have the same blockDim.x. There will be blockDim.y*blockDim.z | |
// such segments. | |
// | |
// X/Y/Z_THREAD defines a sub region of a thread block that should be reduced | |
// with the sub regions of other thread blocks. We call it a reduction block. | |
// E.g., | |
// | |
// Case 1: X/Y/Z_THREAD == false/false/false -> Only thread 0 participates in | |
// the cross-block reductions. The reduction block is 1x1x1 with thread 0. | |
// | |
// Case 2: X/Y/Z_THREAD == true/true/true-> All threads in a thread block | |
// participate in the cross-block reductions. The reduction block in this case | |
// is equivalent to the thread block. | |
// | |
// After the function completes, only one thread block per reduction segment | |
// gets valid reduction results. There is no guarantee which particular block | |
// gets the final results. | |
// | |
template < | |
bool X_BLOCK, | |
bool Y_BLOCK, | |
bool Z_BLOCK, | |
bool X_THREAD, | |
bool Y_THREAD, | |
bool Z_THREAD, | |
bool PERSISTENT_REDUCTION, | |
typename T, | |
typename Func> | |
__device__ void gridReduce( | |
T& out, | |
const T& inp_val, | |
Func reduction_op, | |
volatile T* work_buf, | |
Tensor<int64_t, 1> sync_flags, | |
T* shared_buf, | |
bool read_pred, | |
bool write_pred, | |
T init_val) { | |
// Number of values to reduce in the reduction segment | |
const auto grid_reduction_segment_size = | |
index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim); | |
// Index of the reduction we're performing out of the | |
// grid_reduction_segment_size | |
const auto idx_in_grid_segment = | |
index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>( | |
blockIdx, gridDim); | |
// Number of threads we can use in final reduction, Seems to assume all | |
// threads in the block participate | |
const auto block_reduction_segment_size = | |
index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim); | |
// advance to the offset for this segment | |
// index of reduction * size of the reduction * size of threads | |
work_buf += idx_in_grid_segment * grid_reduction_segment_size * | |
block_reduction_segment_size; | |
if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) && | |
(Z_THREAD || threadIdx.z == 0)) { | |
auto block_offset = | |
index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim); | |
auto thread_offset = | |
index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>( | |
threadIdx, blockDim); | |
auto work_buf_offset = | |
block_offset * block_reduction_segment_size + thread_offset; | |
if (read_pred) { | |
work_buf[work_buf_offset] = inp_val; | |
} else { | |
work_buf[work_buf_offset] = init_val; | |
} | |
} | |
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>( | |
sync_flags[idx_in_grid_segment], grid_reduction_segment_size); | |
bool last_block = | |
index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim); | |
if (last_block) { | |
// Cleanup with block reduction | |
gridReduceLastBlock<X_THREAD, Y_THREAD, Z_THREAD>( | |
out, | |
(T*)work_buf, | |
grid_reduction_segment_size, | |
block_reduction_segment_size, | |
reduction_op, | |
shared_buf, | |
write_pred, | |
init_val); | |
} | |
if (PERSISTENT_REDUCTION) { | |
// Make sure we're done with global memory before we allow the kernel to | |
// continue | |
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>( | |
sync_flags[idx_in_grid_segment], grid_reduction_segment_size); | |
} | |
} | |
} // namespace reduction | |
#undef isize | |
#undef ioffset | |
namespace grid_broadcast { | |
// Broadcasts per-thread values across threads and blocks. | |
// | |
// Function parameters: | |
// - out: Per-thread output location | |
// - inp_val: Per-thread input value | |
// - work_buf: Temporary buffer for communication across threads/blocks | |
// - sync_flags: A vector of integers for synchronizations | |
// | |
// Template parameters: | |
// - X/Y/Z_BLOCK: When true, broadcasts across thread blocks along the X/Y/Z | |
// dimensions | |
// - X/Y/Z_THREAD: When true, broadcasts across threads along the X/Y/Z | |
// dimensions | |
template < | |
bool X_BLOCK, | |
bool Y_BLOCK, | |
bool Z_BLOCK, | |
bool X_THREAD, | |
bool Y_THREAD, | |
bool Z_THREAD, | |
typename T> | |
__device__ void broadcast( | |
T& out, | |
const T& inp_val, | |
volatile T* work_buf, | |
Tensor<int64_t, 1> sync_flags, | |
bool read_write_pred) { | |
// Number of values broadcasted in the grid dimensions | |
const auto grid_seg_size = | |
index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim); | |
// Index of the broadcast we're performing out of the grid_seg_size | |
const auto grid_seg_idx = | |
index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>( | |
blockIdx, gridDim); | |
// Number of threads not participating in a broadcast dimension, this is the | |
// number of thread entries to expect in the work buffer, therefore a striding | |
const auto block_stride = | |
index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim); | |
// Which broadcast in the block this is to line up the entry with the work | |
// buffer | |
const auto thread_offset = | |
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>( | |
threadIdx, blockDim); | |
const bool has_valid_data = (!X_BLOCK || blockIdx.x == gridDim.x - 1) && | |
(!Y_BLOCK || blockIdx.y == gridDim.y - 1) && | |
(!Z_BLOCK || blockIdx.z == gridDim.z - 1) && | |
(!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) && | |
(!Z_THREAD || threadIdx.z == 0); | |
if (has_valid_data && read_write_pred) { | |
work_buf[grid_seg_idx * block_stride + thread_offset] = inp_val; | |
__threadfence(); | |
} | |
bool null = false; | |
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>( | |
sync_flags[grid_seg_idx], grid_seg_size); | |
if (read_write_pred) { | |
out = work_buf[grid_seg_idx * block_stride + thread_offset]; | |
} | |
// Make sure everyone has read from the buffer before continuing the kernel | |
// and potentially overwriting | |
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>( | |
sync_flags[grid_seg_idx], grid_seg_size); | |
} | |
} // namespace grid_broadcast | |
namespace broadcast { | |
// Broadcasts within partitioned groups of threads. | |
// | |
// X_THREAD: Broadcast from threadIdx.x == 0 if true | |
// Y_THREAD: Broadcast from threadIdx.y == 0 if true | |
// Z_THREAD: Broadcast from threadIdx.z == 0 if true | |
// inp_val: Per-thread source value. Only valid when the thread is a source. | |
// out: Per-thread output location | |
// | |
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T> | |
__device__ void blockBroadcast( | |
T& out, | |
const T& inp_val, | |
T* shared_mem, | |
bool read_write_pred) { | |
const bool has_valid_data = (!X_THREAD || threadIdx.x == 0) && | |
(!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0); | |
const auto shared_offset = | |
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>( | |
threadIdx, blockDim); | |
if (has_valid_data && read_write_pred) { | |
shared_mem[shared_offset] = inp_val; | |
} | |
block_sync::sync(); | |
if (read_write_pred) { | |
out = shared_mem[shared_offset]; | |
} | |
block_sync::sync(); | |
} | |
} // namespace broadcast | |
// ----------------------------------------------------------------------------------------------- | |
// Block Welford Primitives | |
// ----------------------------------------------------------------------------------------------- | |
// Basic utility for welford update. Can be used to scan one value, or two merge | |
// two welford results | |
template <typename T, typename TN> | |
__inline__ __device__ void welfordCombine( | |
T& a_avg, | |
T& a_M2, | |
TN& a_N, | |
const T& b_avg, | |
const T& b_M2, | |
TN b_N) { | |
if (b_N == 0) { | |
return; | |
} | |
TN ab_N = a_N + b_N; | |
T b_N_div_ab_N = ((T)(nvfuser_index_t)(b_N)) / ((T)(nvfuser_index_t)(ab_N)); | |
T delta = b_avg - a_avg; | |
a_avg += delta * b_N_div_ab_N; | |
a_M2 += b_M2 + delta * delta * ((T)(nvfuser_index_t)(a_N)) * b_N_div_ab_N; | |
a_N = ab_N; | |
} | |
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x | |
// dimension of the block. | |
template < | |
bool X_REDUCE, | |
bool Y_REDUCE, | |
bool Z_REDUCE, | |
typename T, | |
typename TN, | |
typename _dim3, | |
typename _dim3_2> | |
__inline__ __device__ void blockWelford( | |
T& out_avg, | |
T& out_M2, | |
TN& out_N, | |
const T& in_avg, | |
const T& in_M2, | |
const TN& in_N, | |
const _dim3& thread_idx, | |
const _dim3_2& block_dim, | |
T* shared_mem_avg, | |
T* shared_mem_M2, | |
TN* shared_mem_N, | |
bool read_pred, | |
bool write_pred, | |
T init_val) { | |
// If this thread will output a final result | |
bool should_write = | |
index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx); | |
// Size of the reduction segments | |
unsigned int reduction_size = | |
index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim); | |
// Index into the reduction segment | |
unsigned int reduction_tid = | |
index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>( | |
thread_idx, block_dim); | |
// Index of the reduction segment | |
unsigned int reduction_idx = | |
index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>( | |
thread_idx, block_dim); | |
// Offset into smem for the current thread | |
unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid; | |
if (read_pred) { | |
shared_mem_avg[smem_offset] = in_avg; | |
shared_mem_M2[smem_offset] = in_M2; | |
shared_mem_N[smem_offset] = in_N; | |
} else { | |
shared_mem_avg[smem_offset] = init_val; | |
shared_mem_M2[smem_offset] = init_val; | |
shared_mem_N[smem_offset] = 0; | |
} | |
block_sync::sync(); | |
// Reduce down to nearest power of 2: | |
int np2 = 1 << (31 - __clz(reduction_size)); | |
if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) { | |
welfordCombine( | |
shared_mem_avg[smem_offset], | |
shared_mem_M2[smem_offset], | |
shared_mem_N[smem_offset], | |
shared_mem_avg[smem_offset + np2], | |
shared_mem_M2[smem_offset + np2], | |
shared_mem_N[smem_offset + np2]); | |
} | |
block_sync::sync(); | |
// loop peel the final iteration to save one syncthread for the end | |
for (int factor = np2 / 2; factor > 1; factor >>= 1) { | |
if (reduction_tid < factor) { | |
welfordCombine( | |
shared_mem_avg[smem_offset], | |
shared_mem_M2[smem_offset], | |
shared_mem_N[smem_offset], | |
shared_mem_avg[smem_offset + factor], | |
shared_mem_M2[smem_offset + factor], | |
shared_mem_N[smem_offset + factor]); | |
} | |
block_sync::sync(); | |
} | |
if (should_write && write_pred) { | |
T res_avg = out_avg; | |
T res_M2 = out_M2; | |
TN res_N = out_N; | |
welfordCombine( | |
res_avg, | |
res_M2, | |
res_N, | |
shared_mem_avg[smem_offset], | |
shared_mem_M2[smem_offset], | |
shared_mem_N[smem_offset]); | |
if (reduction_size > 1) { | |
welfordCombine( | |
res_avg, | |
res_M2, | |
res_N, | |
shared_mem_avg[smem_offset + 1], | |
shared_mem_M2[smem_offset + 1], | |
shared_mem_N[smem_offset + 1]); | |
} | |
out_avg = res_avg; | |
out_M2 = res_M2; | |
out_N = res_N; | |
} | |
block_sync::sync(); | |
} | |
// Use the same pred for both reads and writes | |
template < | |
bool X_REDUCE, | |
bool Y_REDUCE, | |
bool Z_REDUCE, | |
typename T, | |
typename TN, | |
typename _dim3, | |
typename _dim3_2> | |
__inline__ __device__ void blockWelford( | |
T& out_avg, | |
T& out_M2, | |
TN& out_N, | |
const T& in_avg, | |
const T& in_M2, | |
const TN& in_N, | |
const _dim3& thread_idx, | |
const _dim3_2& block_dim, | |
T* shared_mem_avg, | |
T* shared_mem_M2, | |
TN* shared_mem_N, | |
bool read_write_pred, | |
T init_val) { | |
blockWelford<X_REDUCE, Y_REDUCE, Z_REDUCE, T, TN, _dim3, _dim3_2>( | |
out_avg, | |
out_M2, | |
out_N, | |
in_avg, | |
in_M2, | |
in_N, | |
thread_idx, | |
block_dim, | |
shared_mem_avg, | |
shared_mem_M2, | |
shared_mem_N, | |
read_write_pred, | |
read_write_pred, | |
init_val); | |
} | |
// ----------------------------------------------------------------------------------------------- | |
// Grid Welford Prototype | |
// ----------------------------------------------------------------------------------------------- | |
namespace welford { | |
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T, typename TN> | |
__device__ void gridWelfordLastBlock( | |
T& out_avg, | |
T& out_M2, | |
TN& out_N, | |
const T* in_avg, | |
const T* in_M2, | |
const TN* in_N, | |
const nvfuser_index_t | |
grid_reduction_segment_size, // Number of reductions across | |
// grid reduce dimensions | |
const nvfuser_index_t | |
block_reduction_segment_size, // Number of reductions across the block | |
T* shared_buf_avg, | |
T* shared_buf_M2, | |
TN* shared_buf_N, | |
bool write_pred, | |
T init_val) { | |
// We have to do num_reductions across reduction_size. The reductions are | |
// contiguous, but offset by reduction_size. There is an entry in "in" for | |
// every block, and every thread marked as true. Threads in dimensions marked | |
// as false can be used to parallelize the reduction. | |
// Find the reduction id of the participating threads | |
const auto block_reduction_segment_idx = | |
index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>( | |
threadIdx, blockDim); | |
// Find an id associated within a reduction segment for all | |
// "non-participating" threads, which will parallelize the reductions for the | |
// "participating" threads | |
const auto id_in_block_segment = | |
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>( | |
threadIdx, blockDim); | |
// Stride by the "non-participating" threads | |
const auto input_stride_for_thread_in_segment = | |
index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim); | |
T inp_avg = init_val; | |
T inp_M2 = init_val; | |
TN inp_N = 0; | |
// Block stride across the reduction until we only have one value per thread | |
for (nvfuser_index_t reduction_i = id_in_block_segment; | |
reduction_i < grid_reduction_segment_size; | |
reduction_i += input_stride_for_thread_in_segment) { | |
auto work_buf_offset = reduction_i * block_reduction_segment_size + | |
block_reduction_segment_idx; | |
welfordCombine( | |
inp_avg, | |
inp_M2, | |
inp_N, | |
in_avg[work_buf_offset], | |
in_M2[work_buf_offset], | |
in_N[work_buf_offset]); | |
} | |
// Block reduce the per thread values into per "participating" thread values | |
T inp_avg_tmp = init_val; | |
T inp_M2_tmp = init_val; | |
TN inp_N_tmp = 0; | |
blockWelford<!X_THREAD, !Y_THREAD, !Z_THREAD>( | |
inp_avg_tmp, | |
inp_M2_tmp, | |
inp_N_tmp, | |
inp_avg, | |
inp_M2, | |
inp_N, | |
threadIdx, | |
blockDim, | |
shared_buf_avg, | |
shared_buf_M2, | |
shared_buf_N, | |
true, | |
init_val); | |
const bool should_write = (X_THREAD || threadIdx.x == 0) && | |
(Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0); | |
if (should_write && write_pred) { | |
welfordCombine(out_avg, out_M2, out_N, inp_avg_tmp, inp_M2_tmp, inp_N_tmp); | |
} | |
} | |
// Grid welford combine | |
template < | |
bool X_BLOCK, | |
bool Y_BLOCK, | |
bool Z_BLOCK, | |
bool X_THREAD, | |
bool Y_THREAD, | |
bool Z_THREAD, | |
bool PERSISTENT_REDUCTION, | |
typename T, | |
typename TN> | |
__device__ void gridWelford( | |
T& out_avg, | |
T& out_M2, | |
TN& out_N, | |
const T& inp_avg, | |
const T& inp_M2, | |
const TN& inp_N, | |
volatile T* work_buf_avg, | |
volatile T* work_buf_M2, | |
volatile TN* work_buf_N, | |
Tensor<int64_t, 1> sync_flags, | |
T* shared_buf_avg, | |
T* shared_buf_M2, | |
TN* shared_buf_N, | |
bool read_pred, | |
bool write_pred, | |
T init_val) { | |
// Number of values to reduce in the reduction segment | |
const auto grid_reduction_segment_size = | |
index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim); | |
// Index of the reduction we're performing out of the | |
// grid_reduction_segment_size | |
const auto idx_in_grid_segment = | |
index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>( | |
blockIdx, gridDim); | |
// Number of threads we can use in final reduction, Seems to assume all | |
// threads in the block participate | |
const auto block_reduction_segment_size = | |
index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim); | |
// advance to the offset for this segment | |
// index of reduction * size of the reduction * size of threads | |
work_buf_avg += idx_in_grid_segment * grid_reduction_segment_size * | |
block_reduction_segment_size; | |
work_buf_M2 += idx_in_grid_segment * grid_reduction_segment_size * | |
block_reduction_segment_size; | |
work_buf_N += idx_in_grid_segment * grid_reduction_segment_size * | |
block_reduction_segment_size; | |
if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) && | |
(Z_THREAD || threadIdx.z == 0)) { | |
auto block_offset = | |
index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim); | |
auto thread_offset = | |
index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>( | |
threadIdx, blockDim); | |
auto work_buf_offset = | |
block_offset * block_reduction_segment_size + thread_offset; | |
if (read_pred) { | |
work_buf_avg[work_buf_offset] = inp_avg; | |
work_buf_M2[work_buf_offset] = inp_M2; | |
work_buf_N[work_buf_offset] = inp_N; | |
} else { | |
work_buf_avg[work_buf_offset] = init_val; | |
work_buf_M2[work_buf_offset] = init_val; | |
work_buf_N[work_buf_offset] = 0; | |
} | |
} | |
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>( | |
sync_flags[idx_in_grid_segment], grid_reduction_segment_size); | |
bool last_block = | |
index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim); | |
if (last_block) { | |
// final reduction | |
gridWelfordLastBlock<X_THREAD, Y_THREAD, Z_THREAD>( | |
out_avg, | |
out_M2, | |
out_N, | |
(T*)work_buf_avg, | |
(T*)work_buf_M2, | |
(TN*)work_buf_N, | |
grid_reduction_segment_size, | |
block_reduction_segment_size, | |
shared_buf_avg, | |
shared_buf_M2, | |
shared_buf_N, | |
write_pred, | |
init_val); | |
} | |
if (PERSISTENT_REDUCTION) { | |
// Make sure we're done with global memory before we allow the kernel to | |
// continue | |
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>( | |
sync_flags[idx_in_grid_segment], grid_reduction_segment_size); | |
} | |
} | |
} // namespace welford | |
#undef isize | |
#undef ioffset | |
namespace warp { | |
template < | |
bool SINGLE_WARP, | |
typename T, | |
typename Func, | |
typename _dim3ti, | |
typename _dim3bd> | |
__device__ void warpReduceTIDX( | |
T& out, | |
const T& inp_val, | |
Func reduction_op, | |
const _dim3ti& thread_idx, | |
const _dim3bd& block_dim, | |
T* shared_mem, | |
bool read_write_pred, | |
T init_val) { | |
constexpr int WARP_SIZE = 32; | |
// Assume input padded to multiples of a warp | |
T reduce_val = init_val; | |
// Do warp reduction | |
if (read_write_pred) { | |
reduce_val = inp_val; | |
} | |
// Reduce within each warp | |
for (int i = 16; i >= 1; i /= 2) { | |
reduction_op( | |
reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, WARP_SIZE)); | |
} | |
// Reduce across warp if needed | |
// Load value to shared mem | |
if (!SINGLE_WARP) { | |
unsigned int warp_idx = thread_idx.x / WARP_SIZE; | |
unsigned int lane_idx = thread_idx.x % WARP_SIZE; | |
unsigned int reduce_group_id = thread_idx.z * block_dim.y + thread_idx.y; | |
bool is_warp_head = lane_idx == 0; | |
unsigned int reduction_size = block_dim.x; | |
unsigned int num_of_warps = reduction_size / WARP_SIZE; | |
unsigned int smem_offset = reduce_group_id * num_of_warps; | |
block_sync::sync(); | |
if (read_write_pred && is_warp_head) { | |
shared_mem[smem_offset + warp_idx] = reduce_val; | |
} | |
block_sync::sync(); | |
if (warp_idx == 0) { | |
// This assumes num_of_warps will be < 32, meaning < 1024 blocks. | |
// Should be true for long enough. | |
assert(num_of_warps <= 32); | |
reduce_val = lane_idx < num_of_warps ? shared_mem[smem_offset + lane_idx] | |
: init_val; | |
// Reduce within warp 0 | |
for (int i = 16; i >= 1; i /= 2) { | |
reduction_op( | |
reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, 32)); | |
} | |
} | |
if (is_warp_head) { | |
reduction_op(out, reduce_val); | |
} | |
} else { | |
reduction_op(out, reduce_val); | |
} | |
} | |
} // namespace warp | |
// No "#pragma once" because this is a raw definition that can be copied by jit codegen. | |
// Eager mode clients should not include this file directly, instead, | |
// they should #include <ATen/cuda/CUDAGeneratorImpl.h>, which has a #pragma once. | |
// Stores RNG state values. Passed as a kernel argument. | |
// See Note [CUDA Graph-safe RNG states]. | |
// | |
// The raw definition lives in its own file so jit codegen can easily copy it. | |
namespace at { | |
struct PhiloxCudaState { | |
PhiloxCudaState() = default; | |
// Called if graph capture is not underway | |
PhiloxCudaState(uint64_t seed, | |
uint64_t offset) { | |
seed_ = seed; | |
offset_.val = offset; | |
} | |
// Called if graph capture is underway | |
PhiloxCudaState(uint64_t seed, | |
int64_t* offset_extragraph, | |
uint32_t offset_intragraph) { | |
seed_ = seed; | |
offset_.ptr = offset_extragraph; | |
offset_intragraph_ = offset_intragraph; | |
captured_ = true; | |
} | |
// Public members, directly accessible by at::cuda::philox::unpack. | |
// If we made them private with getters/setters, the getters/setters | |
// would have to be __device__, and we can't declare __device__ in ATen. | |
union Payload { | |
uint64_t val; | |
int64_t* ptr; | |
}; | |
uint64_t seed_ = 0; | |
Payload offset_; | |
uint32_t offset_intragraph_ = 0; | |
bool captured_ = false; | |
}; | |
} // namespace at | |
__global__ void kernel133(Tensor<bool, 0> T0, Tensor<int, 0> T1, Tensor<int, 0> T2, Tensor<int64_t, 0> T3, Tensor<int64_t, 0> T4, Tensor<int, 0> T6, Tensor<int64_t, 0> T5) { | |
T6[0] | |
= where(T0[0], T1[0], T2[0]); | |
T5[0] | |
= where(T0[0], T3[0], T4[0]); | |
} | |
} | |
CUDA NVRTC compile error: default_program(1670): error: more than one instance of overloaded function "CudaCodeGen::where" matches the argument list: | |
function "CudaCodeGen::where(__nv_bool, double, double)" | |
function "CudaCodeGen::where(__nv_bool, float, float)" | |
function "CudaCodeGen::where(__nv_bool, CudaCodeGen::int64_t, CudaCodeGen::int64_t)" | |
argument types are: (__nv_bool, int, int) | |
1 error detected in the compilation of "default_program". | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_mean_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Error adding cache_after T2_g[ iS4{i13}, iS5{i16}, sbS6{1}, iS7{i22} ] we restrict using cache_after on an output. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_mean_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Expected kernel_ to be true, but got false. (Could this error message be improved? If so, please report an enhancement request to PyTorch.) | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_mean_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Expected kernel_ to be true, but got false. (Could this error message be improved? If so, please report an enhancement request to PyTorch.) | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_prod_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: falseINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/executor_utils.cpp":828, please report a bug to PyTorch. namespace CudaCodeGen { | |
typedef unsigned char uint8_t; | |
typedef signed char int8_t; | |
typedef short int int16_t; | |
typedef int int32_t; | |
typedef unsigned int uint32_t; | |
typedef long long int int64_t; | |
typedef unsigned long long int uint64_t; | |
typedef int nvfuser_index_t; | |
#define __NVFUSER_HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var))) | |
#define __NVFUSER_HALF_TO_CUS(var) \ | |
*(reinterpret_cast<const unsigned short*>(&(var))) | |
struct __half; | |
__device__ __half __float2half(const float); | |
struct __align__(2) __half { | |
__half() = default; | |
__device__ __half(const float f) { | |
__x = __float2half(f).__x; | |
} | |
protected: | |
unsigned short __x; | |
}; | |
__device__ __half __float2half(const float f) { | |
__half val; | |
asm("{ cvt.rn.f16.f32 %0, %1;}\n" | |
: "=h"(__NVFUSER_HALF_TO_US(val)) | |
: "f"(f)); | |
return val; | |
} | |
__device__ float __half2float(const __half h) { | |
float val; | |
asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__NVFUSER_HALF_TO_CUS(h))); | |
return val; | |
} | |
// aligned vector generates vectorized load/store on CUDA | |
template <typename scalar_t, int vec_size> | |
struct alignas(sizeof(scalar_t) * vec_size) Array { | |
scalar_t val[vec_size]; | |
__device__ void set(scalar_t v) { | |
for (int i = 0; i < vec_size; ++i) { | |
val[i] = v; | |
} | |
} | |
}; | |
#define __NVFUSER_BFLOAT_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var))) | |
#define __NVFUSER_BFLOAT_TO_CUS(var) \ | |
*(reinterpret_cast<const unsigned short*>(&(var))) | |
struct __bfloat; | |
__device__ __bfloat __float2bfloat(const float); | |
struct __align__(2) __bfloat { | |
__bfloat() = default; | |
__device__ __bfloat(const float f) { | |
__x = __float2bfloat(f).__x; | |
} | |
protected: | |
unsigned short __x; | |
}; | |
__device__ __bfloat __float2bfloat(const float f) { | |
__bfloat val; | |
asm("{ cvt.rn.bf16.f32 %0, %1;}\n" | |
: "=h"(__NVFUSER_BFLOAT_TO_US(val)) | |
: "f"(f)); | |
return val; | |
} | |
__device__ float __bfloat2float(const __bfloat h) { | |
float val; | |
asm("{ mov.b32 %0, {0,%1};}\n" | |
: "=f"(val) | |
: "h"(__NVFUSER_BFLOAT_TO_CUS(h))); | |
return val; | |
} | |
template <typename T, int N> | |
struct Tensor { | |
__device__ T& operator[](nvfuser_index_t ind) { | |
return data[ind]; | |
}; | |
T* data; | |
nvfuser_index_t size[N]; | |
nvfuser_index_t stride[N]; | |
}; | |
// Specialization for 0-dim case as it does not need size and stride arrays.
// They will be an error as well since zero-length arrays are not allowed.
template <typename T>
struct Tensor<T, 0> {
  // Any index addresses the single scalar element; the argument is ignored.
  __device__ T& operator[](nvfuser_index_t) {
    return *data;
  };
  T* data;
};
// Counter-based Philox pseudo-random generator (4x32 state, 10 rounds: the
// 9 loop iterations plus the final single_round). Seeded with
// (seed, subsequence, offset); each operator() call returns one 32-bit draw
// and a fresh 4-word block is generated every fourth call.
class Philox {
 public:
  __device__ Philox(
      unsigned long long seed,
      unsigned long long subsequence,
      unsigned long long offset) {
    // Split the 64-bit seed into the two 32-bit key words.
    key.x = (unsigned int)seed;
    key.y = (unsigned int)(seed >> 32);
    // The subsequence occupies the upper counter words, selecting an
    // independent stream; offset/4 skips ahead within the stream (each
    // counter tick yields 4 draws).
    counter = make_uint4(0, 0, 0, 0);
    counter.z = (unsigned int)(subsequence);
    counter.w = (unsigned int)(subsequence >> 32);
    STATE = 0;
    incr_n(offset / 4);
  }
  // Return the next 32-bit random value (carried in an unsigned long).
  __device__ unsigned long operator()() {
    if (STATE == 0) {
      // Generate a new 4-word output block from the current counter/key.
      uint4 counter_ = counter;
      uint2 key_ = key;
      for (int i = 0; i < 9; i++) {
        counter_ = single_round(counter_, key_);
        key_.x += (kPhilox10A);
        key_.y += (kPhilox10B);
      }
      output = single_round(counter_, key_);
      incr();
    }
    // Hand out the four words of the current block one at a time.
    unsigned long ret = 0;
    switch (STATE) {
      case 0:
        ret = output.x;
        break;
      case 1:
        ret = output.y;
        break;
      case 2:
        ret = output.z;
        break;
      case 3:
        ret = output.w;
        break;
    }
    STATE = (STATE + 1) % 4;
    return ret;
  }

 private:
  // Advance the 128-bit counter by n, propagating carries word to word.
  __device__ void incr_n(unsigned long long n) {
    unsigned int nlo = (unsigned int)(n);
    unsigned int nhi = (unsigned int)(n >> 32);
    counter.x += nlo;
    // Unsigned wrap-around on counter.x means a carry into the next word.
    if (counter.x < nlo)
      nhi++;
    counter.y += nhi;
    // No wrap-around on counter.y: done, no carry into counter.z.
    if (nhi <= counter.y)
      return;
    if (++counter.z)
      return;
    ++counter.w;
  }
  // Advance the 128-bit counter by one (carry stops at first non-zero word).
  __device__ void incr() {
    if (++counter.x)
      return;
    if (++counter.y)
      return;
    if (++counter.z)
      return;
    ++counter.w;
  }
  // 32x32 -> 64-bit multiply: returns the low 32 bits, stores the high 32
  // bits through result_high (via the __umulhi intrinsic).
  __device__ unsigned int mulhilo32(
      unsigned int a,
      unsigned int b,
      unsigned int* result_high) {
    *result_high = __umulhi(a, b);
    return a * b;
  }
  // One Philox round: two widening multiplies, then xor-mix with the key.
  __device__ uint4 single_round(uint4 ctr, uint2 key) {
    unsigned int hi0;
    unsigned int hi1;
    unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
    unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
    uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
    return ret;
  }

 private:
  // Philox round-key increments (10A/10B) and multipliers (SA/SB).
  static constexpr unsigned long kPhilox10A = 0x9E3779B9;
  static constexpr unsigned long kPhilox10B = 0xBB67AE85;
  static constexpr unsigned long kPhiloxSA = 0xD2511F53;
  static constexpr unsigned long kPhiloxSB = 0xCD9E8D57;
  uint4 counter = {};
  uint4 output = {};
  uint2 key = {};
  // Which word of `output` the next operator() call returns (0..3).
  unsigned int STATE = 0;
};
// Map a full-range 32-bit integer to a float in [0, 1).
__device__ float uniformf(unsigned int x) {
  // 2.3283064e-10f is 1/2^32.
  constexpr float kInv2Pow32 = 2.3283064e-10f;
  return kInv2Pow32 * x;
}
// Combine two 32-bit random draws into a uniformly distributed double:
// x supplies the low bits, y the bits above position 21 (53 - 32), giving a
// 53-bit significand scaled by 2^-53 and centered off zero by half an ulp.
__device__ double uniform(unsigned int x, unsigned int y) {
  constexpr double kRan2Pow53Inv = 1.1102230246251565e-16; // 2^-53
  unsigned long long bits = (unsigned long long)x;
  bits ^= (unsigned long long)y << (53 - 32);
  return bits * kRan2Pow53Inv + (kRan2Pow53Inv / 2.0);
}
// Materialize "nvfuser_zero": an int whose runtime value is always 0 but
// which the compiler cannot prove constant (thread 0 writes 0, then
// atomicMin with a non-negative threadIdx.x leaves it at 0). Presumably
// added to index expressions to block unwanted constant folding/hoisting.
#define NVFUSER_DEFINE_MAGIC_ZERO          \
  __shared__ int nvfuser_zero_s;           \
  if (threadIdx.x == 0)                    \
    nvfuser_zero_s = 0;                    \
  __syncthreads();                         \
  atomicMin(&nvfuser_zero_s, threadIdx.x); \
  int nvfuser_zero = nvfuser_zero_s;
// Keep nvfuser_zero opaque across uses: 0 << 1 is still 0, but the shift
// prevents the compiler from treating the variable as a known constant.
#define NVFUSER_UPDATE_MAGIC_ZERO \
  do {                            \
    nvfuser_zero <<= 1;           \
  } while (0);
// Integer ceiling division: smallest integer >= a/b for positive b.
__device__ constexpr int ceilDiv(int a, int b) {
  return (a + (b - 1)) / b;
}
__device__ constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + (b - 1)) / b;
}
// Mixed-width overloads promote the narrower operand to 64 bits.
__device__ constexpr int64_t ceilDiv(int64_t a, int b) {
  return ceilDiv(a, static_cast<int64_t>(b));
}
__device__ constexpr int64_t ceilDiv(int a, int64_t b) {
  return ceilDiv(static_cast<int64_t>(a), b);
}
// max overload set; mixed int/int64_t arguments widen to int64_t.
__device__ constexpr int max(int a, int b) {
  return a < b ? b : a;
}
__device__ constexpr int64_t max(int64_t a, int b) {
  return a < (int64_t)b ? (int64_t)b : a;
}
__device__ constexpr int64_t max(int a, int64_t b) {
  return (int64_t)a < b ? b : (int64_t)a;
}
__device__ constexpr int64_t max(int64_t a, int64_t b) {
  return a < b ? b : a;
}
// NaN-propagating maximum: if either operand is NaN, that NaN is returned
// (::fmax would instead return the non-NaN operand).
__device__ double fmax(double a, double b) {
  if (a != a) { // a is NaN
    return a;
  }
  if (b != b) { // b is NaN
    return b;
  }
  return ::fmax(a, b);
}
// Single-precision variant with identical NaN semantics.
__device__ float fmax(float a, float b) {
  if (a != a) { // a is NaN
    return a;
  }
  if (b != b) { // b is NaN
    return b;
  }
  return ::fmax(a, b);
}
// min overload set; mixed int/int64_t arguments widen to int64_t.
__device__ constexpr int min(int a, int b) {
  return b < a ? b : a;
}
__device__ constexpr int64_t min(int64_t a, int b) {
  return (int64_t)b < a ? (int64_t)b : a;
}
__device__ constexpr int64_t min(int a, int64_t b) {
  return b < (int64_t)a ? b : (int64_t)a;
}
__device__ constexpr int64_t min(int64_t a, int64_t b) {
  return b < a ? b : a;
}
// NaN-propagating minimum: if either operand is NaN, that NaN is returned
// (::fmin would instead return the non-NaN operand).
__device__ double fmin(double a, double b) {
  if (a != a) { // a is NaN
    return a;
  }
  if (b != b) { // b is NaN
    return b;
  }
  return ::fmin(a, b);
}
// Single-precision variant with identical NaN semantics.
__device__ float fmin(float a, float b) {
  if (a != a) { // a is NaN
    return a;
  }
  if (b != b) { // b is NaN
    return b;
  }
  return ::fmin(a, b);
}
// Round buffer up to the next multiple of size; size must be a power of
// two. Uses -size == ~(size - 1) in two's complement.
__device__ constexpr int alignBufferSize(int buffer, int size) {
  return (buffer + size - 1) & (-size);
}
// Clamp x into [minv, maxv]. NaN x fails both comparisons and is returned
// unchanged.
__device__ double clamp(double x, double minv, double maxv) {
  if (x < minv) {
    return minv;
  }
  return x > maxv ? maxv : x;
}
// Float input with double bounds; the result narrows back to float.
__device__ float clamp(float x, double minv, double maxv) {
  if (x < minv) {
    return minv;
  }
  return x > maxv ? maxv : x;
}
// Fractional part of x, keeping x's sign (x minus its truncation).
__device__ double frac(double x) {
  const double whole = trunc(x);
  return x - whole;
}
__device__ float frac(float x) {
  const float whole = trunc(x);
  return x - whole;
}
// Exact GELU: x scaled by the standard normal CDF evaluated at x
// (normcdf is the CUDA math-library cumulative normal distribution).
__device__ double gelu(double x) {
  return normcdf(x) * x;
}
__device__ float gelu(float x) {
  return normcdf(x) * x;
}
// Multiplicative inverse.
__device__ double reciprocal(double x) {
  return 1.0 / x;
}
__device__ float reciprocal(float x) {
  return 1.0f / x;
}
// relu(x): x for positive x, otherwise 0. The test is kept as (x <= 0) so a
// NaN input fails it and is returned unchanged for the float/double
// overloads, matching the original ternary's behavior.
__device__ double relu(double x) {
  if (x <= 0) {
    return 0;
  }
  return x;
}
__device__ float relu(float x) {
  if (x <= 0) {
    return 0;
  }
  return x;
}
// Integer overloads intentionally return float.
__device__ float relu(int64_t x) {
  if (x <= 0) {
    return 0;
  }
  return x;
}
__device__ float relu(int x) {
  if (x <= 0) {
    return 0;
  }
  return x;
}
// Python-style floating remainder: start from ::fmod (sign of a) and shift
// by b when the signs disagree, so the result takes the sign of b.
__device__ double remainder(double a, double b) {
  auto mod = ::fmod(a, b);
  const bool signs_differ = (b < 0) != (mod < 0);
  if (mod != 0 && signs_differ) {
    mod += b;
  }
  return mod;
}
__device__ float remainder(float a, float b) {
  auto mod = ::fmod(a, b);
  const bool signs_differ = (b < 0) != (mod < 0);
  if (mod != 0 && signs_differ) {
    mod += b;
  }
  return mod;
}
// Logistic sigmoid: 1 / (1 + e^-x).
__device__ double sigmoid(double x) {
  const double denom = 1 + exp(-x);
  return 1 / denom;
}
__device__ float sigmoid(float x) {
  const float denom = 1 + exp(-x);
  return 1 / denom;
}
// SiLU (a.k.a. swish): x * sigmoid(x).
__device__ double silu(double x) {
  return sigmoid(x) * x;
}
__device__ float silu(float x) {
  return sigmoid(x) * x;
}
// Pass x through when it exceeds threshold t; otherwise substitute v.
__device__ double threshold(double x, double t, double v) {
  if (x <= t) {
    return v;
  }
  return x;
}
__device__ float threshold(float x, double t, double v) {
  if (x <= t) {
    return v;
  }
  return x;
}
// Scalar select: a when c is true, b otherwise.
__device__ double where(bool c, double a, double b) {
  if (c) {
    return a;
  }
  return b;
}
__device__ float where(bool c, float a, float b) {
  if (c) {
    return a;
  }
  return b;
}
__device__ int64_t where(bool c, int64_t a, int64_t b) {
  if (c) {
    return a;
  }
  return b;
}
// Draw a uniform double in (0, 1) from the Philox stream.
__device__ double randLike(Philox& rnd) {
  // C++ does not specify the evaluation order of function arguments, so
  // `uniform(rnd(), rnd())` would consume the two draws in a
  // compiler-dependent order. Sequence the draws explicitly to keep the
  // generated values deterministic across compilers.
  unsigned int x = rnd();
  unsigned int y = rnd();
  return uniform(x, y);
}
// Draw a uniform float in [0, 1) from the Philox stream (one draw).
__device__ float randLikef(Philox& rnd) {
  return uniformf(rnd());
}
// Python-style integer modulo: C++ % truncates toward zero (sign of a);
// shifting by b when the signs disagree yields a result with the sign of b.
__device__ constexpr int64_t remainder(int64_t a, int64_t b) {
  auto mod = a % b;
  const bool signs_differ = (b < 0) != (mod < 0);
  if (mod != 0 && signs_differ) {
    mod += b;
  }
  return mod;
}
__device__ constexpr int remainder(int a, int b) {
  auto mod = a % b;
  const bool signs_differ = (b < 0) != (mod < 0);
  if (mod != 0 && signs_differ) {
    mod += b;
  }
  return mod;
}
// fmod overloads. The integer versions use the division identity
// x % y == x - (x / y) * y (truncating division, result has x's sign);
// the floating versions defer to the math library.
__device__ constexpr int64_t fmod(int64_t x, int64_t y) {
  return x - (x / y) * y;
}
__device__ constexpr int fmod(int x, int y) {
  return x - (x / y) * y;
}
__device__ constexpr double fmod(double x, double y) {
  return ::fmod(x, y);
}
__device__ constexpr float fmod(float x, float y) {
  return ::fmod(x, y);
}
// Integer exponentiation by squaring. Negative exponents follow the
// integer-pow convention: 1^b == 1, (-1)^b == +/-1 by parity of b, and any
// other base gives 0 (the exact result would be fractional). float/double
// are specialized below to defer to ::pow.
template <typename T>
__device__ T pow(T a, T b) {
  if (b < 0) {
    if (a == 1) {
      return 1;
    }
    if (a == -1) {
      // Odd negative exponent -> -1, even -> 1.
      auto odd = (-b) % static_cast<T>(2);
      return odd ? -1 : 1;
    }
    return 0;
  }
  // Square-and-multiply over the bits of the exponent.
  T result = 1;
  T base = a;
  T e = b;
  while (e) {
    if (e & 1) {
      result *= base;
    }
    e /= 2;
    base *= base;
  }
  return result;
}
// Explicit instantiations for the integer types used by generated kernels.
template int pow<int>(int a, int b);
template int64_t pow<int64_t>(int64_t a, int64_t b);
// Floating-point versions defer to the math library.
template <>
float pow<float>(float a, float b) {
  return ::pow(a, b);
}
template <>
double pow<double>(double a, double b) {
  return ::pow(a, b);
}
namespace index_utils {
// Utility functions
// Total size of provided dimension
template <typename _dim3>
__device__ __forceinline__ nvfuser_index_t size(const _dim3& d) {
  return (nvfuser_index_t)d.x * (nvfuser_index_t)d.y * (nvfuser_index_t)d.z;
}
// Linearized indexing of idx based on dim, if bool==false that dimension does
// not participate. Layout is z-major: for participating dims the result is
// ((idx.z * dim.y) + idx.y) * dim.x + idx.x.
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t maskedOffset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = 0;
  if (Z)
    offset += idx.z;
  if (Y)
    offset = offset * dim.y + idx.y;
  if (X)
    offset = offset * dim.x + idx.x;
  return offset;
}
// Linearized indexing of idx based on dim. All dimensions participate.
template <typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t offset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = idx.z;
  offset = offset * dim.y + idx.y;
  offset = offset * dim.x + idx.x;
  return offset;
}
// Masks the provided dim3, those == false get truncated to 1
template <bool X, bool Y, bool Z, typename _dim3>
__device__ dim3 maskedDims(const _dim3& dim) {
  return dim3{
      X ? (unsigned)dim.x : 1U,
      Y ? (unsigned)dim.y : 1U,
      Z ? (unsigned)dim.z : 1U};
}
// Provides total size of dim with masking, those dims == false do not
// participate in the size calculation
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
__device__ nvfuser_index_t maskedSize(const _dim3& dim) {
  return size(maskedDims<X_BLOCK, Y_BLOCK, Z_BLOCK>(dim));
}
// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3>
__device__ bool maskedIsZero(const _dim3& idx) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == 0;
  if (Y)
    isZero = isZero && idx.y == 0;
  if (Z)
    isZero = isZero && idx.z == 0;
  return isZero;
}
// Checks if provided idx is the last position (dim - 1) on those dims == true
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ bool maskedIsLast(const _dim3& idx, const _dim3_2& dim) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == dim.x - 1;
  if (Y)
    isZero = isZero && idx.y == dim.y - 1;
  if (Z)
    isZero = isZero && idx.z == dim.z - 1;
  return isZero;
}
} // namespace index_utils
// Default block synchronization. Just use __barrier_sync
namespace block_sync {
// No setup needed for the default barrier-based implementation.
__forceinline__ __device__ void init() {}
// Thread-block synchronization (barrier 0 covers the whole block).
__forceinline__ __device__ void sync() {
  __barrier_sync(0);
}
} // namespace block_sync
namespace grid_sync {
// Get the first (most significant) bit in a 64 bit integer
#define FIRST_UINT64_BIT ((uint64_t)1 << (sizeof(uint64_t) * 8 - 1))
// Read a global value through a volatile reference so the load always goes
// to memory instead of being cached in a register.
template <typename T>
__device__ T globalAsVolatile(volatile T& global_val) {
  return global_val;
}
// A grid synchronization that can be called multiple times in a kernel assuming
// all the blocks fit on device at once. The semaphore is an integer semaphore
// assumed to be initialized to 0 before launching the kernel. The persistent
// option should be invoked if this sync will be called multiple times in one
// kernel (i.e. having a grid reduce within a loop). Having multiple grid syncs
// called once in the same kernel does not require persistent mode. Segment size
// is the number of blocks participating in the sync in the dimensions marked by
// [X,Y,Z]_BLOCK. The granularity of this sync are those dimensions. I.E.
// Marking X and Y but not Z means there should be Z semaphores of size X*Y.
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, bool PERSISTENT>
__device__ void sync(int64_t& semaphore, const uint64_t& segment_size) {
  // Finish all global memory transactions before synchronizing
  __threadfence();
  // Synchronize all threads in a block before synchronizing blocks
  block_sync::sync();
  // Only allow linear_tid == 0 to participate in the synchronization
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    // Get increment value, only want a single block to have the large
    // increment, doesn't really matter which one, the goal is to flip/flop the
    // first bit of a uint64_t value, since our semaphores are actually int64_t
    // we will just reinterpret_cast it to act as a uint64_t
    uint64_t semaphore_increment = 1;
    // Makes the assumption that blocks are in increasing order, this is not
    // guaranteed by CUDA but this is the current behavior, and unlikely to
    // change.
    bool last_block =
        index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    if (last_block) {
      // Chosen so the sum of all arrivals ((segment_size - 1) ones plus this)
      // equals FIRST_UINT64_BIT, flipping the semaphore's top bit once per
      // full sync.
      semaphore_increment = FIRST_UINT64_BIT - (segment_size - 1);
    }
    uint64_t oldArrive =
        atomicAdd(reinterpret_cast<uint64_t*>(&semaphore), semaphore_increment);
    // If for persistent kernels, lock all blocks until the semaphore has been
    // reached. Make sure we access semaphore as a volatile address so we get
    // the global memory updates.
    while ((PERSISTENT || last_block) &&
           ((oldArrive ^ globalAsVolatile(semaphore)) & FIRST_UINT64_BIT) ==
               0) {
      // Put a sleep here so we have some breaks in probing the global
      // semaphore, giving a better chance for other warps/blocks to catch up.
#if __CUDA_ARCH__ >= 700
      __nanosleep(200);
#else
      // __nanosleep is not available for sm < 70
      assert(false);
#endif
    }
  }
  // Sync block to make sure all other threads are waiting on the sync
  block_sync::sync();
}
} // namespace grid_sync
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x | |
// dimension of the block. If set to false the dimension doesn't | |
// participate in the reduction. We could start with warp reductions, then | |
// reduce the warps, this could save some shared memory, but could be slower in | |
// some instances. | |
// | |
// EXAMPLE USAGE: | |
// blockReduceSum<X_THREADS, Y_THREADS, Z_THREADS> | |
// (output[output_index], inputs[input_index], | |
// [] __device__ (T& a, const T b) { a += b; }); | |
// | |
// Note: We aggressively template functions taking dim3 in the functions below
// because ROCM uses different types for the various dim3 and maps them | |
// directly to intrinsics, but they're dim3 when used after modification. | |
// | |
// Intra-block reduction via a shared-memory tree.
//
// Reduces inp_val across the block dimensions selected by X/Y/Z_REDUCE;
// threads along non-reduced dimensions form independent segments.
// shared_mem must provide one T slot per thread of the block. The result is
// folded into `out` (out's prior value participates via reduction_op) only
// on threads at offset 0 of every reduced dimension and only when
// write_pred is true; threads with read_pred == false contribute init_val.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename Func,
    typename _dim3,
    typename _dim3_2>
__device__ void blockReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);
  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);
  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          thread_idx, block_dim);
  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          thread_idx, block_dim);
  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;
  // Initialize shared memory
  if (read_pred) {
    shared_mem[smem_offset] = inp_val;
  } else {
    shared_mem[smem_offset] = init_val;
  }
  block_sync::sync();
  // Reduce down to nearest power of 2 for the tree reduction:
  int np2 = 1 << (31 - __clz(reduction_size));
  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + np2]);
  }
  block_sync::sync();
  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + factor]);
    }
    block_sync::sync();
  }
  if (should_write && write_pred) {
    T result = out;
    reduction_op(result, shared_mem[smem_offset]);
    if (reduction_size > 1) {
      // Fold in the peeled factor == 1 step: slot 1 was never merged by the
      // loop above (it stops at factor > 1).
      reduction_op(result, shared_mem[smem_offset + 1]);
    }
    out = result;
  }
  // Final sync so shared_mem can be safely reused after this call returns.
  block_sync::sync();
}
// Use the same pred for both reads and writes
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename Func,
    typename _dim3,
    typename _dim3_2>
__device__ void blockReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem,
    bool read_write_pred,
    T init_val) {
  // Forward to the two-predicate variant with read_pred == write_pred.
  blockReduce<X_REDUCE, Y_REDUCE, Z_REDUCE, T, Func, _dim3, _dim3_2>(
      out,
      inp_val,
      reduction_op,
      thread_idx,
      block_dim,
      shared_mem,
      read_write_pred,
      read_write_pred,
      init_val);
}
// Inter-block reduction. | |
// | |
// The gridReduce function performs point-wise reductions of scalars across | |
// thread blocks. Thread blocks are disjointly partitioned into groups, | |
// "reduction segments", that are collectively defined by boolean template | |
// parameters, X_BLOCK, Y_BLOCK and Z_BLOCK. Each of X/Y/Z_BLOCK determines | |
// whether thread blocks along the dimension should be grouped into the same | |
// reduction segment. Cross-block reducitons are independently done within each | |
// segment and generates distinctive results per segment. For instance, if all | |
// of X/Y/Z_BLOCK are true, reductions will be done across all thread blocks | |
// since there will be just a single segment consisting of all thread blocks. If | |
// none of them are true, each thread block will become a segment by itself, so | |
// no reduction will be performed. | |
// | |
// The input scalars to reduce within each segment are a certain subset of | |
// thread-private scalars provided as part of the gridReduce function | |
// parameters. Boolean template parameters, X_THREAD, Y_THREAD and Z_THREAD, | |
// determine which subset of the scalars should be used for inter-block | |
// reductions. Specifically, all the input scalars of threads along each | |
// dimension will be used when X/Y/Z_THREAD are true. Otherwise, only the value | |
// held at offset 0 of each dimension will be used. Thus, for example, if all of | |
// X/Y/Z_THREAD are true, the scalars of all threads in each block will | |
// participate in inter-block reductions. If all of them are false, only one | |
// scalar of the thread at threadIdx.x == threadIdx.y == threadIdx.z == 0 will | |
// be used. In the code below, we call the subset of threads a "reduction | |
// block". "Participating" thread dimensions here are similar to the | |
// "non-participating" block dimensions. They come from a block dimension that | |
// has not been reduced before hitting this grid reduction. | |
// | |
// Inter-block reductions perform point-wise reductions of scalars of reduction | |
// blocks within each reduction segment. More specifically, let rb be a | |
// reduction block and rs be a reduction segment. Let IN(thread_idx, block_idx) | |
// denote the input scalar of thread at thread_idx and block_idx. The result of | |
// each reduction segment, OUT(thread_idx, block_idx_out), is defined only for | |
// each thread_idx in thread block block_idx_out in the segment as follows: | |
// | |
// OUT(thread_idx, block_idx_out) = | |
// Reduction of IN(thread_idx, block_idx) for | |
// all block_idx in a reduction segment | |
// | |
// OUT is not given for all threads that are not in block_idx_out and the | |
// reduction block. | |
// | |
// See also the function comment of gridReduce. | |
namespace reduction { | |
// Reduces all the reduction blocks in each reduction segment. This is the | |
// "cleanup" stage of a grid reduction. | |
// | |
// This is only called by one thread block per reduction segment. The input | |
// reduction blocks of the segment are stored in an intermediate buffer pointed | |
// by parameter in. Template parameters X/Y/Z_THREAD denote how the reduction | |
// block is formed. | |
// | |
// The size of a reduction block is by definition smaller or equal to the size | |
// of a thread block. We use the remaining threads to parallelize reductions | |
// across reduction blocks. For example, when X/Y/Z_THREAD = {true, false, | |
// false}, we use blockDim.y*blockDim.z threads for each output value. This is | |
// done first by loading the input values in parallel and then by reducing | |
// across threads of dimensions whose XYZ_THREAD are false. | |
// | |
// Note that what is done here after the loading from global memory is similar | |
// to what the existing blockReduce function does. | |
template <
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    typename T,
    typename Func>
__device__ void gridReduceLastBlock(
    T& out,
    const T* in, // Intermediate buffer holding per-block partial results
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    Func reduction_op,
    T* shared_buf,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.
  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, blockDim);
  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
  // Local accumulator, seeded with the reduction's identity value.
  T inp = init_val;
  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    reduction_op(inp, in[work_buf_offset]);
  }
  // Block reduce the per thread values into per "participating" thread values
  // (note the inverted template flags: the reduction runs across the
  // non-participating dimensions).
  T inp_tmp = init_val;
  blockReduce<!X_THREAD, !Y_THREAD, !Z_THREAD>(
      inp_tmp,
      inp,
      reduction_op,
      threadIdx,
      blockDim,
      shared_buf,
      true,
      init_val);
  // Only threads at offset 0 of each non-participating dimension hold the
  // combined value; fold it into out when the write predicate allows.
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    reduction_op(out, inp_tmp);
  }
}
// Reduces per-thread values across thread blocks. | |
// | |
// Function parameters: | |
// - out: Per-thread output location | |
// - inp_val: Per-thread input value | |
// - reduction_op: Scalar reduction function | |
// - work_buf: Temporary buffer for cross-block reductions | |
// - sync_flags: A vector of integers for synchronizations | |
// - shared_buf: Shared memory buffer for intra-block reduction | |
// | |
// Thread has valid results based on if it's the last block in the grid | |
// reduction dimension | |
// | |
// Template parameters: | |
// - X/Y/Z_BLOCK: When true, reduces across thread blocks along the X/Y/Z | |
// dimensions | |
// - X/Y/Z_THREAD: When true, all threads along the X/Y/Z dimensions participate | |
// in the cross-block reduction. Otherwise, only threads at offset 0 do. | |
// These are set to true if the dimension in the block has not been reduced | |
// previously in producer tensors, and does not participate in the reduction | |
// (right now they can't), so it's just a "pure" iteration domain as far as | |
// the grid reduce is concerned. | |
// - PERSISTENT_REDUCTION: Indicates grid reduction will be called in a loop, or | |
// the result of the grid reduction will be broadcasted and used across the | |
// grid. These requires cross grid communication and the grid synchronizations | |
// here to actually synchronize across the entire grid. When false the grid is | |
// not synchronized, the last block just waits for everyone else to finish and | |
// the other blocks can exit early. | |
// - T: Scalar data type of input/output data | |
// - Func: Type of scalar reduction function
// | |
// Template parameters X/Y/Z_BLOCK define a group of thread blocks that are | |
// reduced together. We call it a reduction segment. Some examples are: | |
// | |
// Case 1: X/Y/Z_BLOCK == true/true/true -> There is only one segment, which | |
// includes all thread blocks. It is effectively the same as the grid.
// | |
// Case 2: X/Y/Z_BLOCK == false/false/false -> Each thread block comprises an | |
// individual segment by itself. | |
// | |
// Case 3: X/Y/Z_BLOCK == true/false/false -> Each segment contains thread | |
// blocks that have the same blockDim.x. There will be blockDim.y*blockDim.z | |
// such segments. | |
// | |
// X/Y/Z_THREAD defines a sub region of a thread block that should be reduced | |
// with the sub regions of other thread blocks. We call it a reduction block. | |
// E.g., | |
// | |
// Case 1: X/Y/Z_THREAD == false/false/false -> Only thread 0 participates in | |
// the cross-block reductions. The reduction block is 1x1x1 with thread 0. | |
// | |
// Case 2: X/Y/Z_THREAD == true/true/true-> All threads in a thread block | |
// participate in the cross-block reductions. The reduction block in this case | |
// is equivalent to the thread block. | |
// | |
// After the function completes, only one thread block per reduction segment | |
// gets valid reduction results. There is no guarantee which particular block | |
// gets the final results. | |
// | |
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    typename T,
    typename Func>
__device__ void gridReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    volatile T* work_buf, // Global scratch: one T per (block, participating
                          // thread) pair in each segment
    Tensor<int64_t, 1> sync_flags, // One semaphore per grid segment
    T* shared_buf,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);
  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  // Each participating thread (offset 0 along non-participating dims)
  // publishes its partial value (or init_val when masked by read_pred).
  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, blockDim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    if (read_pred) {
      work_buf[work_buf_offset] = inp_val;
    } else {
      work_buf[work_buf_offset] = init_val;
    }
  }
  // Wait until every block in the segment has published its partials.
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
      sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
  if (last_block) {
    // Cleanup with block reduction
    gridReduceLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
        out,
        (T*)work_buf,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        reduction_op,
        shared_buf,
        write_pred,
        init_val);
  }
  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
        sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  }
}
} // namespace reduction | |
#undef isize | |
#undef ioffset | |
namespace grid_broadcast { | |
// Broadcasts per-thread values across threads and blocks. | |
// | |
// Function parameters: | |
// - out: Per-thread output location | |
// - inp_val: Per-thread input value | |
// - work_buf: Temporary buffer for communication across threads/blocks | |
// - sync_flags: A vector of integers for synchronizations | |
// | |
// Template parameters: | |
// - X/Y/Z_BLOCK: When true, broadcasts across thread blocks along the X/Y/Z | |
// dimensions | |
// - X/Y/Z_THREAD: When true, broadcasts across threads along the X/Y/Z | |
// dimensions | |
// Broadcast inp_val from the source block/thread (last block along each
// broadcast grid dimension, thread 0 along each broadcast thread dimension)
// to all threads of the segment through the global work_buf, using two grid
// syncs: one after the write, one after everyone has read.
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    typename T>
__device__ void broadcast(
    T& out,
    const T& inp_val,
    volatile T* work_buf,
    Tensor<int64_t, 1> sync_flags,
    bool read_write_pred) {
  // Number of values broadcasted in the grid dimensions
  const auto grid_seg_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the broadcast we're performing out of the grid_seg_size
  const auto grid_seg_idx =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of threads not participating in a broadcast dimension, this is the
  // number of thread entries to expect in the work buffer, therefore a striding
  const auto block_stride =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
  // Which broadcast in the block this is to line up the entry with the work
  // buffer
  const auto thread_offset =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  // Only the source position holds the value to broadcast.
  const bool has_valid_data = (!X_BLOCK || blockIdx.x == gridDim.x - 1) &&
      (!Y_BLOCK || blockIdx.y == gridDim.y - 1) &&
      (!Z_BLOCK || blockIdx.z == gridDim.z - 1) &&
      (!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) &&
      (!Z_THREAD || threadIdx.z == 0);
  if (has_valid_data && read_write_pred) {
    work_buf[grid_seg_idx * block_stride + thread_offset] = inp_val;
    // Make the write visible to other blocks before the grid sync.
    __threadfence();
  }
  // (An unused local `bool null = false;` was removed here; it had no
  // effect on behavior.)
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
      sync_flags[grid_seg_idx], grid_seg_size);
  if (read_write_pred) {
    out = work_buf[grid_seg_idx * block_stride + thread_offset];
  }
  // Make sure everyone has read from the buffer before continuing the kernel
  // and potentially overwriting
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
      sync_flags[grid_seg_idx], grid_seg_size);
}
} // namespace grid_broadcast | |
namespace broadcast { | |
// Broadcast a per-group source value to every thread in the group.
//
// X/Y/Z_THREAD: when true, broadcast from index 0 along that dimension.
// The source thread deposits inp_val into the buffer slot owned by its
// group; after a block sync all predicated threads read the slot back. A
// trailing sync keeps the buffer from being reused before every reader is
// done.
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T>
__device__ void blockBroadcast(
    T& out,
    const T& inp_val,
    T* shared_mem,
    bool read_write_pred) {
  // One buffer slot per group; the non-broadcast dimensions select the slot.
  const auto buf_idx =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  // The source thread has index 0 along every broadcast dimension.
  const bool is_source = (!X_THREAD || threadIdx.x == 0) &&
      (!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0);
  if (is_source && read_write_pred) {
    shared_mem[buf_idx] = inp_val;
  }
  block_sync::sync();
  if (read_write_pred) {
    out = shared_mem[buf_idx];
  }
  block_sync::sync();
}
} // namespace broadcast | |
// ----------------------------------------------------------------------------------------------- | |
// Block Welford Primitives | |
// ----------------------------------------------------------------------------------------------- | |
// Merge one Welford partial (b_avg, b_M2, b_N) into another
// (a_avg, a_M2, a_N), in place. Works both for folding in a single sample
// (b_N == 1) and for merging two partial aggregates (standard parallel
// Welford update: shift the mean by the weighted delta, grow M2 by the
// between-part contribution).
template <typename T, typename TN>
__inline__ __device__ void welfordCombine(
    T& a_avg,
    T& a_M2,
    TN& a_N,
    const T& b_avg,
    const T& b_M2,
    TN b_N) {
  // Empty contribution: nothing to merge (also avoids a 0/0 below).
  if (b_N == 0) {
    return;
  }
  TN merged_N = a_N + b_N;
  T b_frac = ((T)(nvfuser_index_t)(b_N)) / ((T)(nvfuser_index_t)(merged_N));
  T mean_delta = b_avg - a_avg;
  a_avg += mean_delta * b_frac;
  a_M2 += b_M2 + mean_delta * mean_delta * ((T)(nvfuser_index_t)(a_N)) * b_frac;
  a_N = merged_N;
}
// Block-level Welford reduction.
//
// Each thread stages its partial (in_avg, in_M2, in_N) in shared memory;
// partials within a reduction segment are tree-reduced, and the thread at
// index 0 along every reduced dimension folds the result into
// (out_avg, out_M2, out_N) when write_pred holds.
//
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block.
//
// Parameters:
// - out_avg/out_M2/out_N: accumulated into (not overwritten) on the writer.
// - in_avg/in_M2/in_N: this thread's partial.
// - thread_idx/block_dim: thread coordinates and block shape.
// - shared_mem_avg/M2/N: shared staging buffers, one slot per thread.
// - read_pred: when false this thread contributes (init_val, init_val, 0).
// - write_pred: gates the final write-back.
// - init_val: identity value staged for avg/M2 when read_pred is false.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename TN,
    typename _dim3,
    typename _dim3_2>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);
  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);
  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          thread_idx, block_dim);
  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          thread_idx, block_dim);
  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;
  // Stage this thread's partial (or the identity if it is predicated off).
  if (read_pred) {
    shared_mem_avg[smem_offset] = in_avg;
    shared_mem_M2[smem_offset] = in_M2;
    shared_mem_N[smem_offset] = in_N;
  } else {
    shared_mem_avg[smem_offset] = init_val;
    shared_mem_M2[smem_offset] = init_val;
    shared_mem_N[smem_offset] = 0;
  }
  block_sync::sync();
  // Reduce down to nearest power of 2:
  int np2 = 1 << (31 - __clz(reduction_size));
  // Fold the tail beyond np2 into the first np2 slots.
  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    welfordCombine(
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset],
        shared_mem_avg[smem_offset + np2],
        shared_mem_M2[smem_offset + np2],
        shared_mem_N[smem_offset + np2]);
  }
  block_sync::sync();
  // Tree reduction over the power-of-2 range; stop at factor == 1 —
  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      welfordCombine(
          shared_mem_avg[smem_offset],
          shared_mem_M2[smem_offset],
          shared_mem_N[smem_offset],
          shared_mem_avg[smem_offset + factor],
          shared_mem_M2[smem_offset + factor],
          shared_mem_N[smem_offset + factor]);
    }
    block_sync::sync();
  }
  // Writer (reduction_tid == 0) merges its slot, the peeled slot + 1, and the
  // caller's running (out_avg, out_M2, out_N).
  if (should_write && write_pred) {
    T res_avg = out_avg;
    T res_M2 = out_M2;
    TN res_N = out_N;
    welfordCombine(
        res_avg,
        res_M2,
        res_N,
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset]);
    if (reduction_size > 1) {
      // The peeled final iteration: slot at offset + 1.
      welfordCombine(
          res_avg,
          res_M2,
          res_N,
          shared_mem_avg[smem_offset + 1],
          shared_mem_M2[smem_offset + 1],
          shared_mem_N[smem_offset + 1]);
    }
    out_avg = res_avg;
    out_M2 = res_M2;
    out_N = res_N;
  }
  // Protect the shared buffers before any subsequent reuse.
  block_sync::sync();
}
// Convenience overload of blockWelford that uses the same predicate for both
// reads and writes; forwards directly to the two-predicate version above.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename TN,
    typename _dim3,
    typename _dim3_2>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_write_pred,
    T init_val) {
  blockWelford<X_REDUCE, Y_REDUCE, Z_REDUCE, T, TN, _dim3, _dim3_2>(
      out_avg,
      out_M2,
      out_N,
      in_avg,
      in_M2,
      in_N,
      thread_idx,
      block_dim,
      shared_mem_avg,
      shared_mem_M2,
      shared_mem_N,
      read_write_pred, // read predicate
      read_write_pred, // write predicate
      init_val);
}
// ----------------------------------------------------------------------------------------------- | |
// Grid Welford Prototype | |
// ----------------------------------------------------------------------------------------------- | |
namespace welford { | |
// Final step of a grid Welford: run by the last block to arrive, it reduces
// the per-block partials stored in the global work buffers (in_avg/M2/N)
// into (out_avg, out_M2, out_N).
//
// X/Y/Z_THREAD: thread dimensions that hold independent results
// ("participating" threads); the remaining dimensions are used to
// parallelize the reduction.
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T, typename TN>
__device__ void gridWelfordLastBlock(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T* in_avg,
    const T* in_M2,
    const TN* in_N,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.
  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, blockDim);
  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
  // Per-thread running partial, seeded with the identity.
  T inp_avg = init_val;
  T inp_M2 = init_val;
  TN inp_N = 0;
  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    welfordCombine(
        inp_avg,
        inp_M2,
        inp_N,
        in_avg[work_buf_offset],
        in_M2[work_buf_offset],
        in_N[work_buf_offset]);
  }
  // Block reduce the per thread values into per "participating" thread values
  T inp_avg_tmp = init_val;
  T inp_M2_tmp = init_val;
  TN inp_N_tmp = 0;
  blockWelford<!X_THREAD, !Y_THREAD, !Z_THREAD>(
      inp_avg_tmp,
      inp_M2_tmp,
      inp_N_tmp,
      inp_avg,
      inp_M2,
      inp_N,
      threadIdx,
      blockDim,
      shared_buf_avg,
      shared_buf_M2,
      shared_buf_N,
      true,
      init_val);
  // One writer per participating-thread slot: index 0 along every
  // non-participating dimension.
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    welfordCombine(out_avg, out_M2, out_N, inp_avg_tmp, inp_M2_tmp, inp_N_tmp);
  }
}
// Grid welford combine
//
// Combines Welford partials across the grid in four steps:
// 1) one thread per slot writes its partial to the global work buffers,
// 2) participating blocks synchronize on sync_flags,
// 3) the last block to arrive reduces all partials via gridWelfordLastBlock,
// 4) in PERSISTENT_REDUCTION mode, a second sync protects the buffers
//    before the kernel continues (and could reuse them).
//
// X/Y/Z_BLOCK: reduce across blocks along that grid dimension.
// X/Y/Z_THREAD: thread dims holding independent results (see
// gridWelfordLastBlock); the writer predicate below matches that convention.
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    typename T,
    typename TN>
__device__ void gridWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& inp_avg,
    const T& inp_M2,
    const TN& inp_N,
    volatile T* work_buf_avg,
    volatile T* work_buf_M2,
    volatile TN* work_buf_N,
    Tensor<int64_t, 1> sync_flags,
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);
  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf_avg += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  work_buf_M2 += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  work_buf_N += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  // One writer per result slot: index 0 along every non-participating
  // thread dimension. Predicated-off writers stage the identity instead.
  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, blockDim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    if (read_pred) {
      work_buf_avg[work_buf_offset] = inp_avg;
      work_buf_M2[work_buf_offset] = inp_M2;
      work_buf_N[work_buf_offset] = inp_N;
    } else {
      work_buf_avg[work_buf_offset] = init_val;
      work_buf_M2[work_buf_offset] = init_val;
      work_buf_N[work_buf_offset] = 0;
    }
  }
  // Wait until every participating block has published its partials.
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
      sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
  if (last_block) {
    // final reduction
    gridWelfordLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
        out_avg,
        out_M2,
        out_N,
        (T*)work_buf_avg,
        (T*)work_buf_M2,
        (TN*)work_buf_N,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        shared_buf_avg,
        shared_buf_M2,
        shared_buf_N,
        write_pred,
        init_val);
  }
  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
        sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  }
}
} // namespace welford | |
#undef isize | |
#undef ioffset | |
namespace warp { | |
// Warp-based reduction along threadIdx.x.
//
// Each warp first reduces its 32 lanes with shuffle-XOR exchanges; when the
// block holds more than one warp (!SINGLE_WARP), warp heads stage their
// results in shared memory and warp 0 reduces across warps. Only warp-head
// lanes (lane 0) fold the result into `out` via reduction_op.
//
// - out: reduction accumulator (combined with reduction_op, not overwritten)
// - inp_val: this thread's input; ignored when read_write_pred is false
// - shared_mem: staging buffer for cross-warp partials
// - init_val: identity value for predicated-off threads / missing warps
template <
    bool SINGLE_WARP,
    typename T,
    typename Func,
    typename _dim3ti,
    typename _dim3bd>
__device__ void warpReduceTIDX(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3ti& thread_idx,
    const _dim3bd& block_dim,
    T* shared_mem,
    bool read_write_pred,
    T init_val) {
  constexpr int WARP_SIZE = 32;
  // Assume input padded to multiples of a warp
  T reduce_val = init_val;
  // Do warp reduction
  if (read_write_pred) {
    reduce_val = inp_val;
  }
  // Reduce within each warp
  // (butterfly exchange: each step halves the distance 16, 8, ..., 1)
  for (int i = 16; i >= 1; i /= 2) {
    reduction_op(
        reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, WARP_SIZE));
  }
  // Reduce across warp if needed
  // Load value to shared mem
  if (!SINGLE_WARP) {
    unsigned int warp_idx = thread_idx.x / WARP_SIZE;
    unsigned int lane_idx = thread_idx.x % WARP_SIZE;
    // One shared-mem region per (y, z) reduction group.
    unsigned int reduce_group_id = thread_idx.z * block_dim.y + thread_idx.y;
    bool is_warp_head = lane_idx == 0;
    unsigned int reduction_size = block_dim.x;
    unsigned int num_of_warps = reduction_size / WARP_SIZE;
    unsigned int smem_offset = reduce_group_id * num_of_warps;
    block_sync::sync();
    if (read_write_pred && is_warp_head) {
      shared_mem[smem_offset + warp_idx] = reduce_val;
    }
    block_sync::sync();
    if (warp_idx == 0) {
      // This assumes num_of_warps will be < 32, meaning < 1024 blocks.
      // Should be true for long enough.
      assert(num_of_warps <= 32);
      reduce_val = lane_idx < num_of_warps ? shared_mem[smem_offset + lane_idx]
                                           : init_val;
      // Reduce within warp 0
      for (int i = 16; i >= 1; i /= 2) {
        reduction_op(
            reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, 32));
      }
    }
    // Only lane 0 of warp 0 holds the fully-reduced value at this point.
    if (is_warp_head) {
      reduction_op(out, reduce_val);
    }
  } else {
    reduction_op(out, reduce_val);
  }
}
} // namespace warp | |
// No "#pragma once" because this is a raw definition that can be copied by jit codegen. | |
// Eager mode clients should not include this file directly, instead, | |
// they should #include <ATen/cuda/CUDAGeneratorImpl.h>, which has a #pragma once. | |
// Stores RNG state values. Passed as a kernel argument. | |
// See Note [CUDA Graph-safe RNG states]. | |
// | |
// The raw definition lives in its own file so jit codegen can easily copy it. | |
namespace at { | |
// Bundle of Philox RNG seed/offset state; see the notes above this struct.
struct PhiloxCudaState {
  PhiloxCudaState() = default;
  // Called if graph capture is not underway: the offset is a concrete value.
  PhiloxCudaState(uint64_t seed,
                  uint64_t offset) {
    seed_ = seed;
    offset_.val = offset;
  }
  // Called if graph capture is underway: the offset is referenced through a
  // pointer (offset_extragraph), with an extra in-graph offset on top.
  PhiloxCudaState(uint64_t seed,
                  int64_t* offset_extragraph,
                  uint32_t offset_intragraph) {
    seed_ = seed;
    offset_.ptr = offset_extragraph;
    offset_intragraph_ = offset_intragraph;
    captured_ = true;
  }
  // Public members, directly accessible by at::cuda::philox::unpack.
  // If we made them private with getters/setters, the getters/setters
  // would have to be __device__, and we can't declare __device__ in ATen.
  //
  // Either a concrete offset (val) or a pointer to one (ptr); which member
  // is active depends on captured_.
  union Payload {
    uint64_t val;
    int64_t* ptr;
  };
  uint64_t seed_ = 0;
  Payload offset_;
  uint32_t offset_intragraph_ = 0; // additional offset used during capture
  bool captured_ = false; // true when constructed for graph capture
};
} // namespace at | |
// Generated fusion kernel: elementwise select on 0-dim bool tensors,
// T3 = where(T0, T1, T2).
// NOTE(review): per the NVRTC error reported below this dump, this call is
// ambiguous — the runtime header defines where() only for double/float/
// int64_t, so where(bool, bool, bool) matches no overload exactly.
__global__ void kernel187(Tensor<bool, 0> T0, Tensor<bool, 0> T1, Tensor<bool, 0> T2, Tensor<bool, 0> T3) {
  T3[0]
     = where(T0[0], T1[0], T2[0]);
}
} | |
CUDA NVRTC compile error: default_program(1670): error: more than one instance of overloaded function "CudaCodeGen::where" matches the argument list: | |
function "CudaCodeGen::where(__nv_bool, double, double)" | |
function "CudaCodeGen::where(__nv_bool, float, float)" | |
function "CudaCodeGen::where(__nv_bool, CudaCodeGen::int64_t, CudaCodeGen::int64_t)" | |
argument types are: (__nv_bool, __nv_bool, __nv_bool) | |
1 error detected in the compilation of "default_program". | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_prod_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__ | |
raise RuntimeError(msg) | |
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_float32! Caching allocator allocated memory was 1485824 and is now reported as 1513984 on device 0. CUDA driver allocated memory was 1715470336 and is now 1717567488. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_prod_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: falseINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/executor_utils.cpp":828, please report a bug to PyTorch. namespace CudaCodeGen { | |
// Fixed-width integer aliases declared manually — presumably because the
// program is compiled by NVRTC without the standard headers (TODO confirm).
typedef unsigned char uint8_t;
typedef signed char int8_t;
typedef short int int16_t;
typedef int int32_t;
typedef unsigned int uint32_t;
typedef long long int int64_t;
typedef unsigned long long int uint64_t;
// Index type used for tensor sizes, strides, and offsets in generated code.
typedef int nvfuser_index_t;
// Reinterpret a __half's storage as (const) unsigned short, for use as an
// inline-asm operand.
#define __NVFUSER_HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_HALF_TO_CUS(var) \
  *(reinterpret_cast<const unsigned short*>(&(var)))
struct __half;
__device__ __half __float2half(const float);
// Minimal 16-bit half type: 2-byte aligned raw bit storage; construction
// from float converts through __float2half.
struct __align__(2) __half {
  __half() = default;
  __device__ __half(const float f) {
    __x = __float2half(f).__x;
  }
 protected:
  unsigned short __x; // raw half bits
};
// float -> half, round-to-nearest-even (PTX cvt.rn.f16.f32).
__device__ __half __float2half(const float f) {
  __half val;
  asm("{ cvt.rn.f16.f32 %0, %1;}\n"
      : "=h"(__NVFUSER_HALF_TO_US(val))
      : "f"(f));
  return val;
}
// half -> float widening conversion (PTX cvt.f32.f16), exact.
__device__ float __half2float(const __half h) {
  float val;
  asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__NVFUSER_HALF_TO_CUS(h)));
  return val;
}
// aligned vector generates vectorized load/store on CUDA
// (aligning the struct to sizeof(scalar_t) * vec_size lets the whole array
// be moved as a single aligned unit).
template <typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) Array {
  scalar_t val[vec_size];
  // Fill every element with v.
  __device__ void set(scalar_t v) {
    for (int i = 0; i < vec_size; ++i) {
      val[i] = v;
    }
  }
};
// Reinterpret a __bfloat's storage as (const) unsigned short, for use as an
// inline-asm operand.
#define __NVFUSER_BFLOAT_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_BFLOAT_TO_CUS(var) \
  *(reinterpret_cast<const unsigned short*>(&(var)))
struct __bfloat;
__device__ __bfloat __float2bfloat(const float);
// Minimal 16-bit bfloat16 type: 2-byte aligned raw bit storage; construction
// from float converts through __float2bfloat.
struct __align__(2) __bfloat {
  __bfloat() = default;
  __device__ __bfloat(const float f) {
    __x = __float2bfloat(f).__x;
  }
 protected:
  unsigned short __x; // raw bfloat16 bits
};
// float -> bfloat16, round-to-nearest-even (PTX cvt.rn.bf16.f32).
__device__ __bfloat __float2bfloat(const float f) {
  __bfloat val;
  asm("{ cvt.rn.bf16.f32 %0, %1;}\n"
      : "=h"(__NVFUSER_BFLOAT_TO_US(val))
      : "f"(f));
  return val;
}
// bfloat16 -> float: place the 16 bits in the high half of the f32 word
// (bfloat16 shares float's exponent layout), exact.
__device__ float __bfloat2float(const __bfloat h) {
  float val;
  asm("{ mov.b32 %0, {0,%1};}\n"
      : "=f"(val)
      : "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
  return val;
}
// Tensor argument view passed to generated kernels: raw data pointer plus
// per-dimension size and stride arrays.
template <typename T, int N>
struct Tensor {
  // Index with a precomputed linear offset; strides are applied by the
  // generated indexing code, not here.
  __device__ T& operator[](nvfuser_index_t ind) {
    return data[ind];
  };
  T* data;
  nvfuser_index_t size[N];
  nvfuser_index_t stride[N];
};
// Specialization for 0-dim case as it does not need size and stride arrays.
// They will be an error as well since zero-length arrays are not allowed.
template <typename T>
struct Tensor<T, 0> {
  // Any index dereferences the single element.
  __device__ T& operator[](nvfuser_index_t) {
    return *data;
  };
  T* data;
};
// Philox counter-based RNG (4x32 words). The key is derived from `seed`,
// the position from `subsequence` (upper counter words) and `offset`
// (lower counter words; one counter step yields 4 outputs). operator()
// returns one 32-bit draw; a fresh 4-word batch is generated every 4 calls.
class Philox {
 public:
  __device__ Philox(
      unsigned long long seed,
      unsigned long long subsequence,
      unsigned long long offset) {
    // Split the 64-bit seed into the two 32-bit key words.
    key.x = (unsigned int)seed;
    key.y = (unsigned int)(seed >> 32);
    counter = make_uint4(0, 0, 0, 0);
    counter.z = (unsigned int)(subsequence);
    counter.w = (unsigned int)(subsequence >> 32);
    STATE = 0;
    // Each counter increment produces 4 outputs, so advance by offset / 4.
    incr_n(offset / 4);
  }
  __device__ unsigned long operator()() {
    if (STATE == 0) {
      // Generate the next 4-word batch: 9 rounds in the loop plus the final
      // round below (10 total), with the key bumped every round.
      uint4 counter_ = counter;
      uint2 key_ = key;
      for (int i = 0; i < 9; i++) {
        counter_ = single_round(counter_, key_);
        key_.x += (kPhilox10A);
        key_.y += (kPhilox10B);
      }
      output = single_round(counter_, key_);
      incr();
    }
    // Hand out one word of the current batch.
    unsigned long ret = 0;
    switch (STATE) {
      case 0:
        ret = output.x;
        break;
      case 1:
        ret = output.y;
        break;
      case 2:
        ret = output.z;
        break;
      case 3:
        ret = output.w;
        break;
    }
    STATE = (STATE + 1) % 4;
    return ret;
  }

 private:
  // Advance the 128-bit counter by n, propagating carries upward.
  __device__ void incr_n(unsigned long long n) {
    unsigned int nlo = (unsigned int)(n);
    unsigned int nhi = (unsigned int)(n >> 32);
    counter.x += nlo;
    if (counter.x < nlo)
      nhi++;
    counter.y += nhi;
    if (nhi <= counter.y)
      return; // no carry out of the second word
    if (++counter.z)
      return;
    ++counter.w;
  }
  // Advance the 128-bit counter by 1, propagating carries upward.
  __device__ void incr() {
    if (++counter.x)
      return;
    if (++counter.y)
      return;
    if (++counter.z)
      return;
    ++counter.w;
  }
  // 32x32 -> 64-bit multiply: high word via __umulhi, low word returned.
  __device__ unsigned int mulhilo32(
      unsigned int a,
      unsigned int b,
      unsigned int* result_high) {
    *result_high = __umulhi(a, b);
    return a * b;
  }
  // One Philox round: multiply two counter words and mix the high/low
  // halves with the other counter words and the key.
  __device__ uint4 single_round(uint4 ctr, uint2 key) {
    unsigned int hi0;
    unsigned int hi1;
    unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
    unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
    uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
    return ret;
  }

 private:
  // Key-schedule and multiplier constants of the Philox algorithm.
  static constexpr unsigned long kPhilox10A = 0x9E3779B9;
  static constexpr unsigned long kPhilox10B = 0xBB67AE85;
  static constexpr unsigned long kPhiloxSA = 0xD2511F53;
  static constexpr unsigned long kPhiloxSB = 0xCD9E8D57;
  uint4 counter = {}; // 128-bit position counter
  uint4 output = {}; // current 4-word batch
  uint2 key = {}; // per-stream key derived from the seed
  unsigned int STATE = 0; // which word of the batch to return next (0-3)
};
// Map a random 32-bit integer to a float uniformly in [0, 1).
__device__ float uniformf(unsigned int x) {
  constexpr float kRanInvM32 = 2.3283064e-10f; // Inverse of 2^32.
  return x * kRanInvM32;
}
// Map two random 32-bit integers to a double in (0, 1): x and y are combined
// into a 53-bit value, scaled by 2^-53, and offset by half a step so the
// result is never exactly 0.
__device__ double uniform(unsigned int x, unsigned int y) {
  constexpr double kRan2Pow53Inv = 1.1102230246251565e-16; // 2^-53
  const unsigned long long z =
      (unsigned long long)x ^ ((unsigned long long)y << (53 - 32));
  return z * kRan2Pow53Inv + (kRan2Pow53Inv / 2.0);
}
// Declares a block-local `nvfuser_zero` that every thread reads as 0, but
// whose value is funneled through shared memory and atomicMin so it is not
// a compile-time constant — presumably to keep the compiler from folding or
// hoisting index arithmetic it is added to (TODO confirm intent).
#define NVFUSER_DEFINE_MAGIC_ZERO          \
  __shared__ int nvfuser_zero_s;           \
  if (threadIdx.x == 0)                    \
    nvfuser_zero_s = 0;                    \
  __syncthreads();                         \
  atomicMin(&nvfuser_zero_s, threadIdx.x); \
  int nvfuser_zero = nvfuser_zero_s;
// Re-obscure nvfuser_zero between uses; 0 << 1 is still 0, so the value
// never changes.
#define NVFUSER_UPDATE_MAGIC_ZERO \
  do {                            \
    nvfuser_zero <<= 1;           \
  } while (0);
// Integer ceiling division: smallest integer >= a / b (for positive b).
__device__ constexpr int ceilDiv(int a, int b) {
  return (a + b - 1) / b;
}
__device__ constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}
// Mixed-width overloads promote to int64_t.
__device__ constexpr int64_t ceilDiv(int64_t a, int b) {
  return ceilDiv(a, (int64_t)b);
}
__device__ constexpr int64_t ceilDiv(int a, int64_t b) {
  return ceilDiv((int64_t)a, b);
}
// Integer max, forwarding to CUDA ::max; mixed-width overloads promote to
// int64_t.
__device__ constexpr int max(int a, int b) {
  return ::max(a, b);
}
__device__ constexpr int64_t max(int64_t a, int b) {
  return ::max(a, (int64_t)b);
}
__device__ constexpr int64_t max(int a, int64_t b) {
  return ::max((int64_t)a, b);
}
__device__ constexpr int64_t max(int64_t a, int64_t b) {
  return ::max(a, b);
}
// NaN-propagating max: unlike ::fmax, which ignores a NaN operand, a NaN
// input is returned as-is (a takes priority when both are NaN).
__device__ double fmax(double a, double b) {
  if (a != a) { // a is NaN
    return a;
  }
  if (b != b) { // b is NaN
    return b;
  }
  return ::fmax(a, b);
}
// float counterpart of the NaN-propagating max above.
__device__ float fmax(float a, float b) {
  if (a != a) { // a is NaN
    return a;
  }
  if (b != b) { // b is NaN
    return b;
  }
  return ::fmax(a, b);
}
// Integer min, forwarding to CUDA ::min; mixed-width overloads promote to
// int64_t.
__device__ constexpr int min(int a, int b) {
  return ::min(a, b);
}
__device__ constexpr int64_t min(int64_t a, int b) {
  return ::min(a, (int64_t)b);
}
__device__ constexpr int64_t min(int a, int64_t b) {
  return ::min((int64_t)a, b);
}
__device__ constexpr int64_t min(int64_t a, int64_t b) {
  return ::min(a, b);
}
// NaN-propagating min: unlike ::fmin, which ignores a NaN operand, a NaN
// input is returned as-is (a takes priority when both are NaN).
__device__ double fmin(double a, double b) {
  if (a != a) { // a is NaN
    return a;
  }
  if (b != b) { // b is NaN
    return b;
  }
  return ::fmin(a, b);
}
// float counterpart of the NaN-propagating min above.
__device__ float fmin(float a, float b) {
  if (a != a) { // a is NaN
    return a;
  }
  if (b != b) { // b is NaN
    return b;
  }
  return ::fmin(a, b);
}
// Round buffer up to the next multiple of size; the mask trick assumes size
// is a power of two.
__device__ constexpr int alignBufferSize(int buffer, int size) {
  return (buffer + (size - 1)) & ~(size - 1);
}
// Clamp x into [minv, maxv]. Bounds are doubles even in the float overload,
// so comparisons happen in double precision.
__device__ double clamp(double x, double minv, double maxv) {
  return x < minv ? minv : (x > maxv ? maxv : x);
}
__device__ float clamp(float x, double minv, double maxv) {
  return x < minv ? minv : (x > maxv ? maxv : x);
}
// Fractional part of x: x - trunc(x), keeping the sign of x.
__device__ double frac(double x) {
  return x - trunc(x);
}
__device__ float frac(float x) {
  return x - trunc(x);
}
// Exact GELU: x * Phi(x), with the normal CDF computed by normcdf.
__device__ double gelu(double x) {
  return x * normcdf(x);
}
__device__ float gelu(float x) {
  return x * normcdf(x);
}
// Multiplicative inverse, 1 / x.
__device__ double reciprocal(double x) {
  return 1 / x;
}
__device__ float reciprocal(float x) {
  return 1 / x;
}
// ReLU: max(x, 0).
__device__ double relu(double x) {
  return x <= 0 ? 0 : x;
}
__device__ float relu(float x) {
  return x <= 0 ? 0 : x;
}
// NOTE(review): the integer overloads return float — presumably matching
// how the code generator types the result; confirm before changing.
__device__ float relu(int64_t x) {
  return x <= 0 ? 0 : x;
}
__device__ float relu(int x) {
  return x <= 0 ? 0 : x;
}
// Python-style remainder: fmod adjusted by one b when the signs of the
// result and b differ, so the result takes the sign of b.
__device__ double remainder(double a, double b) {
  auto mod = ::fmod(a, b);
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}
__device__ float remainder(float a, float b) {
  auto mod = ::fmod(a, b);
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}
// Logistic sigmoid: 1 / (1 + e^-x).
__device__ double sigmoid(double x) {
  return 1 / (1 + exp(-x));
}
__device__ float sigmoid(float x) {
  return 1 / (1 + exp(-x));
}
// SiLU / swish: x * sigmoid(x).
__device__ double silu(double x) {
  return x * sigmoid(x);
}
__device__ float silu(float x) {
  return x * sigmoid(x);
}
// Thresholding: replace x with v wherever x <= t.
__device__ double threshold(double x, double t, double v) {
  return x <= t ? v : x;
}
__device__ float threshold(float x, double t, double v) {
  return x <= t ? v : x;
}
// Elementwise select: returns a when c is true, b otherwise.
// Overloads cover the scalar types emitted by codegen. The bool overload is
// added so calls like where(bool, bool, bool) (emitted for bool tensors, as
// in kernel187 above) resolve unambiguously — without it, all three
// arithmetic overloads require an implicit conversion of the value
// arguments and overload resolution fails as ambiguous.
__device__ double where(bool c, double a, double b) {
  return c ? a : b;
}
__device__ float where(bool c, float a, float b) {
  return c ? a : b;
}
__device__ int64_t where(bool c, int64_t a, int64_t b) {
  return c ? a : b;
}
__device__ bool where(bool c, bool a, bool b) {
  return c ? a : b;
}
// Draw a uniform double in (0, 1) from two 32-bit Philox outputs.
__device__ double randLike(Philox& rnd) {
  return uniform(rnd(), rnd());
}
// Draw a uniform float in [0, 1) from one 32-bit Philox output.
__device__ float randLikef(Philox& rnd) {
  return uniformf(rnd());
}
// Python-style integer remainder: % adjusted by one b when the signs of the
// result and b differ, so the result takes the sign of b.
__device__ constexpr int64_t remainder(int64_t a, int64_t b) {
  auto mod = a % b;
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}
__device__ constexpr int remainder(int a, int b) {
  auto mod = a % b;
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}
// C-style fmod: integer overloads truncate toward zero (plain %); floating
// overloads defer to ::fmod.
__device__ constexpr int64_t fmod(int64_t a, int64_t b) {
  return a % b;
}
__device__ constexpr int fmod(int a, int b) {
  return a % b;
}
__device__ constexpr double fmod(double a, double b) {
  return ::fmod(a, b);
}
__device__ constexpr float fmod(float a, float b) {
  return ::fmod(a, b);
}
// Integer power by repeated squaring.
// Negative exponents follow integer-pow semantics: 1^b == 1, (-1)^b is
// +/-1 by the parity of b, and any other base truncates to 0.
template <typename T>
__device__ T pow(T a, T b) {
  if (b < 0) {
    if (a == 1) {
      return 1;
    } else if (a == -1) {
      auto negative = (-b) % static_cast<T>(2);
      return negative ? -1 : 1;
    } else {
      return 0; // |a| > 1: a^b is a fraction, truncated to 0
    }
  } else {
    // Exponentiation by squaring: multiply in a factor for each set bit of b.
    T result = 1;
    while (b) {
      if (b & 1) {
        result *= a;
      }
      b /= 2;
      a *= a;
    }
    return result;
  }
}
// Explicit instantiations for the integer types used by generated kernels.
template int pow<int>(int a, int b);
template int64_t pow<int64_t>(int64_t a, int64_t b);
// Floating-point specializations defer to the CUDA math library.
template <>
float pow<float>(float a, float b) {
  return ::pow(a, b);
}
template <>
double pow<double>(double a, double b) {
  return ::pow(a, b);
}
namespace index_utils {
// Utility functions
// Total size of provided dimension
template <typename _dim3>
__device__ __forceinline__ nvfuser_index_t size(const _dim3& d) {
  return (nvfuser_index_t)d.x * (nvfuser_index_t)d.y * (nvfuser_index_t)d.z;
}
// Linearized indexing of idx based on dim, if bool==false that dimension does
// not participate
// (z is the most significant participating dimension, x the least)
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t maskedOffset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = 0;
  if (Z)
    offset += idx.z;
  if (Y)
    offset = offset * dim.y + idx.y;
  if (X)
    offset = offset * dim.x + idx.x;
  return offset;
}
// Linearized indexing of idx based on dim. All dimensions participate.
template <typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t offset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = idx.z;
  offset = offset * dim.y + idx.y;
  offset = offset * dim.x + idx.x;
  return offset;
}
// Masks the provided dim3, those == false get truncated to 1
template <bool X, bool Y, bool Z, typename _dim3>
__device__ dim3 maskedDims(const _dim3& dim) {
  return dim3{
      X ? (unsigned)dim.x : 1U,
      Y ? (unsigned)dim.y : 1U,
      Z ? (unsigned)dim.z : 1U};
}
// Provides total size of dim with masking, those dims == false do not
// participate in the size calculation
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
__device__ nvfuser_index_t maskedSize(const _dim3& dim) {
  return size(maskedDims<X_BLOCK, Y_BLOCK, Z_BLOCK>(dim));
}
// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3>
__device__ bool maskedIsZero(const _dim3& idx) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == 0;
  if (Y)
    isZero = isZero && idx.y == 0;
  if (Z)
    isZero = isZero && idx.z == 0;
  return isZero;
}
// Checks if provided idx is zero on those dims == true | |
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2> | |
__device__ bool maskedIsLast(const _dim3& idx, const _dim3_2& dim) { | |
bool isZero = true; | |
if (X) | |
isZero = isZero && idx.x == dim.x - 1; | |
if (Y) | |
isZero = isZero && idx.y == dim.y - 1; | |
if (Z) | |
isZero = isZero && idx.z == dim.z - 1; | |
return isZero; | |
} | |
} // namespace index_utils | |
// Default block synchronization. Just use __barrier_sync
namespace block_sync {
// No per-block state is required by the default implementation.
__forceinline__ __device__ void init() {}
// Thread-block synchronization
__forceinline__ __device__ void sync() {
  __barrier_sync(0);
}
} // namespace block_sync
namespace grid_sync {
// Get the first bit in a 64 bit integer
#define FIRST_UINT64_BIT ((uint64_t)1 << (sizeof(uint64_t) * 8 - 1))
// Read a global value through a volatile reference so the load is re-issued
// on every call instead of being cached in a register across iterations of
// the spin loop below.
template <typename T>
__device__ T globalAsVolatile(volatile T& global_val) {
  return global_val;
}
// A grid synchronization that can be called multiple times in a kernel assuming
// all the blocks fit on device at once. The semaphore is an integer semaphore
// assumed to be initialized to 0 before launching the kernel. The persistent
// option should be invoked if this sync will be called multiple times in one
// kernel (i.e. having a grid reduce within a loop). Having multiple grid syncs
// called once in the same kernel does not require persistent mode. Segment size
// is the number of blocks participating in the sync in the dimensions marked by
// [X,Y,Z]_BLOCK. The granularity of this sync are those dimensions. I.E.
// Marking X and Y but not Z means there should be Z semaphores of size X*Y.
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, bool PERSISTENT>
__device__ void sync(int64_t& semaphore, const uint64_t& segment_size) {
  // Finish all global memory transactions before synchronizing
  __threadfence();
  // Synchronize all threads in a block before synchronizing blocks
  block_sync::sync();
  // Only allow linear_tid == 0 to participate in the synchronization
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    // Get increment value, only want a single block to have the large
    // increment, doesn't really matter which one, the goal is to flip/flop the
    // first bit of a uint64_t value, since our semaphores are actually int64_t
    // we will just reinterpret_cast it to act as a uint64_t
    uint64_t semaphore_increment = 1;
    // Makes the assumption that blocks are in increasing order, this is not
    // guaranteed by CUDA but this is the current behavior, and unlikely to
    // change.
    bool last_block =
        index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    if (last_block) {
      // (segment_size - 1) blocks add 1 each; the last block adds the
      // complement so the total is exactly FIRST_UINT64_BIT, flipping the
      // top bit once every block in the segment has arrived.
      semaphore_increment = FIRST_UINT64_BIT - (segment_size - 1);
    }
    uint64_t oldArrive =
        atomicAdd(reinterpret_cast<uint64_t*>(&semaphore), semaphore_increment);
    // If for persistent kernels, lock all blocks until the semaphore has been
    // reached. Make sure we access semaphore as a volatile address so we get
    // the global memory updates. Non-persistent mode lets every block except
    // the last one exit the spin immediately.
    while ((PERSISTENT || last_block) &&
           ((oldArrive ^ globalAsVolatile(semaphore)) & FIRST_UINT64_BIT) ==
               0) {
      // Put a sleep here so we have some breaks in probing the global
      // semaphore, giving a better chance for other warps/blocks to catch up.
#if __CUDA_ARCH__ >= 700
      __nanosleep(200);
#else
      // __nanosleep is not available for sm < 70
      assert(false);
#endif
    }
  }
  // Sync block to make sure all other threads are waiting on the sync
  block_sync::sync();
}
} // namespace grid_sync
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x | |
// dimension of the block. If set to false the dimension doesn't | |
// participate in the reduction. We could start with warp reductions, then | |
// reduce the warps, this could save some shared memory, but could be slower in | |
// some instances. | |
// | |
// EXAMPLE USAGE: | |
// blockReduceSum<X_THREADS, Y_THREADS, Z_THREADS> | |
// (output[output_index], inputs[input_index], | |
// [] __device__ (T& a, const T b) { a += b; }); | |
// | |
// Note: We aggressively template functions taking dim3 in the functions below
// because ROCM uses different types for the various dim3 and maps them | |
// directly to intrinsics, but they're dim3 when used after modification. | |
// | |
// Intra-block reduction of inp_val into out.
//
// X/Y/Z_REDUCE select which thread dimensions are reduced over; the
// remaining dimensions index independent reduction segments within the
// block. shared_mem must hold one T per thread of the block. read_pred
// gates reading inp_val (init_val is substituted when false); write_pred
// gates the final write to out.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename Func,
    typename _dim3,
    typename _dim3_2>
__device__ void blockReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);
  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);
  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          thread_idx, block_dim);
  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          thread_idx, block_dim);
  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;
  // Initialize shared memory
  if (read_pred) {
    shared_mem[smem_offset] = inp_val;
  } else {
    shared_mem[smem_offset] = init_val;
  }
  block_sync::sync();
  // Reduce down to nearest power of 2 for the tree reduction:
  int np2 = 1 << (31 - __clz(reduction_size));
  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + np2]);
  }
  block_sync::sync();
  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + factor]);
    }
    block_sync::sync();
  }
  if (should_write && write_pred) {
    T result = out;
    reduction_op(result, shared_mem[smem_offset]);
    // Peeled factor == 1 step of the tree reduction above: fold in the
    // neighbor value that the loop deliberately left unmerged.
    if (reduction_size > 1) {
      reduction_op(result, shared_mem[smem_offset + 1]);
    }
    out = result;
  }
  // Re-sync so shared_mem can safely be reused after this call returns.
  block_sync::sync();
}
// Use the same pred for both reads and writes | |
template < | |
bool X_REDUCE, | |
bool Y_REDUCE, | |
bool Z_REDUCE, | |
typename T, | |
typename Func, | |
typename _dim3, | |
typename _dim3_2> | |
__device__ void blockReduce( | |
T& out, | |
const T& inp_val, | |
Func reduction_op, | |
const _dim3& thread_idx, | |
const _dim3_2& block_dim, | |
T* shared_mem, | |
bool read_write_pred, | |
T init_val) { | |
blockReduce<X_REDUCE, Y_REDUCE, Z_REDUCE, T, Func, _dim3, _dim3_2>( | |
out, | |
inp_val, | |
reduction_op, | |
thread_idx, | |
block_dim, | |
shared_mem, | |
read_write_pred, | |
read_write_pred, | |
init_val); | |
} | |
// Inter-block reduction. | |
// | |
// The gridReduce function performs point-wise reductions of scalars across | |
// thread blocks. Thread blocks are disjointly partitioned into groups, | |
// "reduction segments", that are collectively defined by boolean template | |
// parameters, X_BLOCK, Y_BLOCK and Z_BLOCK. Each of X/Y/Z_BLOCK determines | |
// whether thread blocks along the dimension should be grouped into the same | |
// reduction segment. Cross-block reducitons are independently done within each | |
// segment and generates distinctive results per segment. For instance, if all | |
// of X/Y/Z_BLOCK are true, reductions will be done across all thread blocks | |
// since there will be just a single segment consisting of all thread blocks. If | |
// none of them are true, each thread block will become a segment by itself, so | |
// no reduction will be performed. | |
// | |
// The input scalars to reduce within each segment are a certain subset of | |
// thread-private scalars provided as part of the gridReduce function | |
// parameters. Boolean template parameters, X_THREAD, Y_THREAD and Z_THREAD, | |
// determine which subset of the scalars should be used for inter-block | |
// reductions. Specifically, all the input scalars of threads along each | |
// dimension will be used when X/Y/Z_THREAD are true. Otherwise, only the value | |
// held at offset 0 of each dimension will be used. Thus, for example, if all of | |
// X/Y/Z_THREAD are true, the scalars of all threads in each block will | |
// participate in inter-block reductions. If all of them are false, only one | |
// scalar of the thread at threadIdx.x == threadIdx.y == threadIdx.z == 0 will | |
// be used. In the code below, we call the subset of threads a "reduction | |
// block". "Participating" thread dimensions here are similar to the | |
// "non-participating" block dimensions. They come from a block dimension that | |
// has not been reduced before hitting this grid reduction. | |
// | |
// Inter-block reductions perform point-wise reductions of scalars of reduction | |
// blocks within each reduction segment. More specifically, let rb be a | |
// reduction block and rs be a reduction segment. Let IN(thread_idx, block_idx) | |
// denote the input scalar of thread at thread_idx and block_idx. The result of | |
// each reduction segment, OUT(thread_idx, block_idx_out), is defined only for | |
// each thread_idx in thread block block_idx_out in the segment as follows: | |
// | |
// OUT(thread_idx, block_idx_out) = | |
// Reduction of IN(thread_idx, block_idx) for | |
// all block_idx in a reduction segment | |
// | |
// OUT is not given for all threads that are not in block_idx_out and the | |
// reduction block. | |
// | |
// See also the function comment of gridReduce. | |
namespace reduction { | |
// Reduces all the reduction blocks in each reduction segment. This is the | |
// "cleanup" stage of a grid reduction. | |
// | |
// This is only called by one thread block per reduction segment. The input | |
// reduction blocks of the segment are stored in an intermediate buffer pointed | |
// by parameter in. Template parameters X/Y/Z_THREAD denote how the reduction | |
// block is formed. | |
// | |
// The size of a reduction block is by definition smaller or equal to the size | |
// of a thread block. We use the remaining threads to parallelize reductions | |
// across reduction blocks. For example, when X/Y/Z_THREAD = {true, false, | |
// false}, we use blockDim.y*blockDim.z threads for each output value. This is | |
// done first by loading the input values in parallel and then by reducing | |
// across threads of dimensions whose XYZ_THREAD are false. | |
// | |
// Note that what is done here after the loading from global memory is similar | |
// to what the existing blockReduce function does. | |
// Final ("cleanup") stage of a grid reduction, run by the last block of a
// reduction segment. `in` points at the per-block partial results written
// by the earlier stage; grid_reduction_segment_size partials exist per
// participating thread, laid out with stride block_reduction_segment_size.
// Threads in dimensions whose X/Y/Z_THREAD flag is false are used to
// parallelize the loads before a block reduction collapses them.
template <
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    typename T,
    typename Func>
__device__ void gridReduceLastBlock(
    T& out,
    const T* in,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    Func reduction_op,
    T* shared_buf,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.
  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, blockDim);
  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
  T inp = init_val;
  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    reduction_op(inp, in[work_buf_offset]);
  }
  // Block reduce the per thread values into per "participating" thread values
  T inp_tmp = init_val;
  blockReduce<!X_THREAD, !Y_THREAD, !Z_THREAD>(
      inp_tmp,
      inp,
      reduction_op,
      threadIdx,
      blockDim,
      shared_buf,
      true,
      init_val);
  // Only the origin thread of each "participating" group commits the result.
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    reduction_op(out, inp_tmp);
  }
}
// Reduces per-thread values across thread blocks. | |
// | |
// Function parameters: | |
// - out: Per-thread output location | |
// - inp_val: Per-thread input value | |
// - reduction_op: Scalar reduction function | |
// - work_buf: Temporary buffer for cross-block reductions | |
// - sync_flags: A vector of integers for synchronizations | |
// - shared_buf: Shared memory buffer for intra-block reduction | |
// | |
// Thread has valid results based on if it's the last block in the grid | |
// reduction dimension | |
// | |
// Template parameters: | |
// - X/Y/Z_BLOCK: When true, reduces across thread blocks along the X/Y/Z | |
// dimensions | |
// - X/Y/Z_THREAD: When true, all threads along the X/Y/Z dimensions participate | |
// in the cross-block reduction. Otherwise, only threads at offset 0 do. | |
// These are set to true if the dimension in the block has not been reduced | |
// previously in producer tensors, and does not participate in the reduction | |
// (right now they can't), so it's just a "pure" iteration domain as far as | |
// the grid reduce is concerned. | |
// - PERSISTENT_REDUCTION: Indicates grid reduction will be called in a loop, or | |
// the result of the grid reduction will be broadcasted and used across the | |
// grid. These requires cross grid communication and the grid synchronizations | |
// here to actually synchronize across the entire grid. When false the grid is | |
// not synchronized, the last block just waits for everyone else to finish and | |
// the other blocks can exit early. | |
// - T: Scalar data type of input/output data | |
// - Func: Type of scalar reduction function
// | |
// Template parameters X/Y/Z_BLOCK define a group of thread blocks that are | |
// reduced together. We call it a reduction segment. Some examples are: | |
// | |
// Case 1: X/Y/Z_BLOCK == true/true/true -> There is only one segment, which | |
// includes all thread blocks. It is effectively the same as the grid.
// | |
// Case 2: X/Y/Z_BLOCK == false/false/false -> Each thread block comprises an | |
// individual segment by itself. | |
// | |
// Case 3: X/Y/Z_BLOCK == true/false/false -> Each segment contains thread | |
// blocks that have the same blockDim.x. There will be blockDim.y*blockDim.z | |
// such segments. | |
// | |
// X/Y/Z_THREAD defines a sub region of a thread block that should be reduced | |
// with the sub regions of other thread blocks. We call it a reduction block. | |
// E.g., | |
// | |
// Case 1: X/Y/Z_THREAD == false/false/false -> Only thread 0 participates in | |
// the cross-block reductions. The reduction block is 1x1x1 with thread 0. | |
// | |
// Case 2: X/Y/Z_THREAD == true/true/true-> All threads in a thread block | |
// participate in the cross-block reductions. The reduction block in this case | |
// is equivalent to the thread block. | |
// | |
// After the function completes, only one thread block per reduction segment | |
// gets valid reduction results. There is no guarantee which particular block | |
// gets the final results. | |
// | |
// Cross-block reduction entry point; see the long comment above for the
// segment/reduction-block model. Each block writes its partial into
// work_buf, all blocks of the segment synchronize through sync_flags, and
// the last block to arrive finishes the reduction via gridReduceLastBlock.
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    typename T,
    typename Func>
__device__ void gridReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    volatile T* work_buf,
    Tensor<int64_t, 1> sync_flags,
    T* shared_buf,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);
  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  // One representative thread per "participating" group stores the partial
  // value for this block into the segment's slice of work_buf.
  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, blockDim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    if (read_pred) {
      work_buf[work_buf_offset] = inp_val;
    } else {
      work_buf[work_buf_offset] = init_val;
    }
  }
  // Wait until every block of the segment has deposited its partial.
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
      sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
  if (last_block) {
    // Cleanup with block reduction
    gridReduceLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
        out,
        (T*)work_buf,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        reduction_op,
        shared_buf,
        write_pred,
        init_val);
  }
  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
        sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  }
}
} // namespace reduction | |
#undef isize | |
#undef ioffset | |
namespace grid_broadcast { | |
// Broadcasts per-thread values across threads and blocks. | |
// | |
// Function parameters: | |
// - out: Per-thread output location | |
// - inp_val: Per-thread input value | |
// - work_buf: Temporary buffer for communication across threads/blocks | |
// - sync_flags: A vector of integers for synchronizations | |
// | |
// Template parameters: | |
// - X/Y/Z_BLOCK: When true, broadcasts across thread blocks along the X/Y/Z | |
// dimensions | |
// - X/Y/Z_THREAD: When true, broadcasts across threads along the X/Y/Z | |
// dimensions | |
template < | |
bool X_BLOCK, | |
bool Y_BLOCK, | |
bool Z_BLOCK, | |
bool X_THREAD, | |
bool Y_THREAD, | |
bool Z_THREAD, | |
typename T> | |
__device__ void broadcast( | |
T& out, | |
const T& inp_val, | |
volatile T* work_buf, | |
Tensor<int64_t, 1> sync_flags, | |
bool read_write_pred) { | |
// Number of values broadcasted in the grid dimensions | |
const auto grid_seg_size = | |
index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim); | |
// Index of the broadcast we're performing out of the grid_seg_size | |
const auto grid_seg_idx = | |
index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>( | |
blockIdx, gridDim); | |
// Number of threads not participating in a broadcast dimension, this is the | |
// number of thread entries to expect in the work buffer, therefore a striding | |
const auto block_stride = | |
index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim); | |
// Which broadcast in the block this is to line up the entry with the work | |
// buffer | |
const auto thread_offset = | |
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>( | |
threadIdx, blockDim); | |
const bool has_valid_data = (!X_BLOCK || blockIdx.x == gridDim.x - 1) && | |
(!Y_BLOCK || blockIdx.y == gridDim.y - 1) && | |
(!Z_BLOCK || blockIdx.z == gridDim.z - 1) && | |
(!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) && | |
(!Z_THREAD || threadIdx.z == 0); | |
if (has_valid_data && read_write_pred) { | |
work_buf[grid_seg_idx * block_stride + thread_offset] = inp_val; | |
__threadfence(); | |
} | |
bool null = false; | |
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>( | |
sync_flags[grid_seg_idx], grid_seg_size); | |
if (read_write_pred) { | |
out = work_buf[grid_seg_idx * block_stride + thread_offset]; | |
} | |
// Make sure everyone has read from the buffer before continuing the kernel | |
// and potentially overwriting | |
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>( | |
sync_flags[grid_seg_idx], grid_seg_size); | |
} | |
} // namespace grid_broadcast | |
namespace broadcast { | |
// Broadcasts within partitioned groups of threads. | |
// | |
// X_THREAD: Broadcast from threadIdx.x == 0 if true | |
// Y_THREAD: Broadcast from threadIdx.y == 0 if true | |
// Z_THREAD: Broadcast from threadIdx.z == 0 if true | |
// inp_val: Per-thread source value. Only valid when the thread is a source. | |
// out: Per-thread output location | |
// | |
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T> | |
__device__ void blockBroadcast( | |
T& out, | |
const T& inp_val, | |
T* shared_mem, | |
bool read_write_pred) { | |
const bool has_valid_data = (!X_THREAD || threadIdx.x == 0) && | |
(!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0); | |
const auto shared_offset = | |
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>( | |
threadIdx, blockDim); | |
if (has_valid_data && read_write_pred) { | |
shared_mem[shared_offset] = inp_val; | |
} | |
block_sync::sync(); | |
if (read_write_pred) { | |
out = shared_mem[shared_offset]; | |
} | |
block_sync::sync(); | |
} | |
} // namespace broadcast | |
// ----------------------------------------------------------------------------------------------- | |
// Block Welford Primitives | |
// ----------------------------------------------------------------------------------------------- | |
// Basic utility for welford update. Can be used to scan one value, or two merge | |
// two welford results | |
// Merge one Welford accumulator (b_avg, b_M2, b_N) into another
// (a_avg, a_M2, a_N), where avg is the running mean, M2 the running sum of
// squared deviations, and N the sample count. With b_N == 1 this scans a
// single value; with larger b_N it merges two partial results (the
// standard parallel Welford update).
template <typename T, typename TN>
__inline__ __device__ void welfordCombine(
    T& a_avg,
    T& a_M2,
    TN& a_N,
    const T& b_avg,
    const T& b_M2,
    TN b_N) {
  // An empty right-hand accumulator leaves the left-hand side unchanged
  // (and avoids a 0/0 below when both sides are empty).
  if (b_N == 0) {
    return;
  }
  TN ab_N = a_N + b_N;
  // Counts are cast through nvfuser_index_t and then to T so the ratio is
  // computed in the value type rather than with integer division.
  T b_N_div_ab_N = ((T)(nvfuser_index_t)(b_N)) / ((T)(nvfuser_index_t)(ab_N));
  T delta = b_avg - a_avg;
  a_avg += delta * b_N_div_ab_N;
  a_M2 += b_M2 + delta * delta * ((T)(nvfuser_index_t)(a_N)) * b_N_div_ab_N;
  a_N = ab_N;
}
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x | |
// dimension of the block. | |
// Intra-block Welford reduction of (in_avg, in_M2, in_N) into
// (out_avg, out_M2, out_N). Structurally identical to blockReduce, but with
// three shared-memory buffers and welfordCombine as the combiner. read_pred
// gates reading the inputs (init_val / count 0 are substituted when false);
// write_pred gates the final write to the outputs.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename TN,
    typename _dim3,
    typename _dim3_2>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);
  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);
  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          thread_idx, block_dim);
  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          thread_idx, block_dim);
  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;
  if (read_pred) {
    shared_mem_avg[smem_offset] = in_avg;
    shared_mem_M2[smem_offset] = in_M2;
    shared_mem_N[smem_offset] = in_N;
  } else {
    // Count 0 makes this entry a no-op in welfordCombine.
    shared_mem_avg[smem_offset] = init_val;
    shared_mem_M2[smem_offset] = init_val;
    shared_mem_N[smem_offset] = 0;
  }
  block_sync::sync();
  // Reduce down to nearest power of 2:
  int np2 = 1 << (31 - __clz(reduction_size));
  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    welfordCombine(
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset],
        shared_mem_avg[smem_offset + np2],
        shared_mem_M2[smem_offset + np2],
        shared_mem_N[smem_offset + np2]);
  }
  block_sync::sync();
  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      welfordCombine(
          shared_mem_avg[smem_offset],
          shared_mem_M2[smem_offset],
          shared_mem_N[smem_offset],
          shared_mem_avg[smem_offset + factor],
          shared_mem_M2[smem_offset + factor],
          shared_mem_N[smem_offset + factor]);
    }
    block_sync::sync();
  }
  if (should_write && write_pred) {
    T res_avg = out_avg;
    T res_M2 = out_M2;
    TN res_N = out_N;
    welfordCombine(
        res_avg,
        res_M2,
        res_N,
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset]);
    // Peeled factor == 1 step of the tree reduction above.
    if (reduction_size > 1) {
      welfordCombine(
          res_avg,
          res_M2,
          res_N,
          shared_mem_avg[smem_offset + 1],
          shared_mem_M2[smem_offset + 1],
          shared_mem_N[smem_offset + 1]);
    }
    out_avg = res_avg;
    out_M2 = res_M2;
    out_N = res_N;
  }
  // Re-sync so the shared buffers can safely be reused after returning.
  block_sync::sync();
}
// Use the same pred for both reads and writes | |
template < | |
bool X_REDUCE, | |
bool Y_REDUCE, | |
bool Z_REDUCE, | |
typename T, | |
typename TN, | |
typename _dim3, | |
typename _dim3_2> | |
__inline__ __device__ void blockWelford( | |
T& out_avg, | |
T& out_M2, | |
TN& out_N, | |
const T& in_avg, | |
const T& in_M2, | |
const TN& in_N, | |
const _dim3& thread_idx, | |
const _dim3_2& block_dim, | |
T* shared_mem_avg, | |
T* shared_mem_M2, | |
TN* shared_mem_N, | |
bool read_write_pred, | |
T init_val) { | |
blockWelford<X_REDUCE, Y_REDUCE, Z_REDUCE, T, TN, _dim3, _dim3_2>( | |
out_avg, | |
out_M2, | |
out_N, | |
in_avg, | |
in_M2, | |
in_N, | |
thread_idx, | |
block_dim, | |
shared_mem_avg, | |
shared_mem_M2, | |
shared_mem_N, | |
read_write_pred, | |
read_write_pred, | |
init_val); | |
} | |
// ----------------------------------------------------------------------------------------------- | |
// Grid Welford Prototype | |
// ----------------------------------------------------------------------------------------------- | |
namespace welford { | |
// Final ("cleanup") stage of a grid Welford reduction, run by the last
// block of a segment. Mirrors gridReduceLastBlock but carries the three
// Welford components (avg, M2, N) through separate buffers.
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T, typename TN>
__device__ void gridWelfordLastBlock(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T* in_avg,
    const T* in_M2,
    const TN* in_N,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.
  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, blockDim);
  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
  // Local accumulator; count 0 means "empty" for welfordCombine.
  T inp_avg = init_val;
  T inp_M2 = init_val;
  TN inp_N = 0;
  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    welfordCombine(
        inp_avg,
        inp_M2,
        inp_N,
        in_avg[work_buf_offset],
        in_M2[work_buf_offset],
        in_N[work_buf_offset]);
  }
  // Block reduce the per thread values into per "participating" thread values
  T inp_avg_tmp = init_val;
  T inp_M2_tmp = init_val;
  TN inp_N_tmp = 0;
  blockWelford<!X_THREAD, !Y_THREAD, !Z_THREAD>(
      inp_avg_tmp,
      inp_M2_tmp,
      inp_N_tmp,
      inp_avg,
      inp_M2,
      inp_N,
      threadIdx,
      blockDim,
      shared_buf_avg,
      shared_buf_M2,
      shared_buf_N,
      true,
      init_val);
  // Only the origin thread of each "participating" group commits the result.
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    welfordCombine(out_avg, out_M2, out_N, inp_avg_tmp, inp_M2_tmp, inp_N_tmp);
  }
}
// Grid welford combine | |
template < | |
bool X_BLOCK, | |
bool Y_BLOCK, | |
bool Z_BLOCK, | |
bool X_THREAD, | |
bool Y_THREAD, | |
bool Z_THREAD, | |
bool PERSISTENT_REDUCTION, | |
typename T, | |
typename TN> | |
__device__ void gridWelford( | |
T& out_avg, | |
T& out_M2, | |
TN& out_N, | |
const T& inp_avg, | |
const T& inp_M2, | |
const TN& inp_N, | |
volatile T* work_buf_avg, | |
volatile T* work_buf_M2, | |
volatile TN* work_buf_N, | |
Tensor<int64_t, 1> sync_flags, | |
T* shared_buf_avg, | |
T* shared_buf_M2, | |
TN* shared_buf_N, | |
bool read_pred, | |
bool write_pred, | |
T init_val) { | |
// Number of values to reduce in the reduction segment | |
const auto grid_reduction_segment_size = | |
index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim); | |
// Index of the reduction we're performing out of the | |
// grid_reduction_segment_size | |
const auto idx_in_grid_segment = | |
index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>( | |
blockIdx, gridDim); | |
// Number of threads we can use in final reduction, Seems to assume all | |
// threads in the block participate | |
const auto block_reduction_segment_size = | |
index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim); | |
// advance to the offset for this segment | |
// index of reduction * size of the reduction * size of threads | |
work_buf_avg += idx_in_grid_segment * grid_reduction_segment_size * | |
block_reduction_segment_size; | |
work_buf_M2 += idx_in_grid_segment * grid_reduction_segment_size * | |
block_reduction_segment_size; | |
work_buf_N += idx_in_grid_segment * grid_reduction_segment_size * | |
block_reduction_segment_size; | |
if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) && | |
(Z_THREAD || threadIdx.z == 0)) { | |
auto block_offset = | |
index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim); | |
auto thread_offset = | |
index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>( | |
threadIdx, blockDim); | |
auto work_buf_offset = | |
block_offset * block_reduction_segment_size + thread_offset; | |
if (read_pred) { | |
work_buf_avg[work_buf_offset] = inp_avg; | |
work_buf_M2[work_buf_offset] = inp_M2; | |
work_buf_N[work_buf_offset] = inp_N; | |
} else { | |
work_buf_avg[work_buf_offset] = init_val; | |
work_buf_M2[work_buf_offset] = init_val; | |
work_buf_N[work_buf_offset] = 0; | |
} | |
} | |
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>( | |
sync_flags[idx_in_grid_segment], grid_reduction_segment_size); | |
bool last_block = | |
index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim); | |
if (last_block) { | |
// final reduction | |
gridWelfordLastBlock<X_THREAD, Y_THREAD, Z_THREAD>( | |
out_avg, | |
out_M2, | |
out_N, | |
(T*)work_buf_avg, | |
(T*)work_buf_M2, | |
(TN*)work_buf_N, | |
grid_reduction_segment_size, | |
block_reduction_segment_size, | |
shared_buf_avg, | |
shared_buf_M2, | |
shared_buf_N, | |
write_pred, | |
init_val); | |
} | |
if (PERSISTENT_REDUCTION) { | |
// Make sure we're done with global memory before we allow the kernel to | |
// continue | |
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>( | |
sync_flags[idx_in_grid_segment], grid_reduction_segment_size); | |
} | |
} | |
} // namespace welford | |
#undef isize | |
#undef ioffset | |
namespace warp { | |
template < | |
bool SINGLE_WARP, | |
typename T, | |
typename Func, | |
typename _dim3ti, | |
typename _dim3bd> | |
__device__ void warpReduceTIDX( | |
T& out, | |
const T& inp_val, | |
Func reduction_op, | |
const _dim3ti& thread_idx, | |
const _dim3bd& block_dim, | |
T* shared_mem, | |
bool read_write_pred, | |
T init_val) { | |
constexpr int WARP_SIZE = 32; | |
// Assume input padded to multiples of a warp | |
T reduce_val = init_val; | |
// Do warp reduction | |
if (read_write_pred) { | |
reduce_val = inp_val; | |
} | |
// Reduce within each warp | |
for (int i = 16; i >= 1; i /= 2) { | |
reduction_op( | |
reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, WARP_SIZE)); | |
} | |
// Reduce across warp if needed | |
// Load value to shared mem | |
if (!SINGLE_WARP) { | |
unsigned int warp_idx = thread_idx.x / WARP_SIZE; | |
unsigned int lane_idx = thread_idx.x % WARP_SIZE; | |
unsigned int reduce_group_id = thread_idx.z * block_dim.y + thread_idx.y; | |
bool is_warp_head = lane_idx == 0; | |
unsigned int reduction_size = block_dim.x; | |
unsigned int num_of_warps = reduction_size / WARP_SIZE; | |
unsigned int smem_offset = reduce_group_id * num_of_warps; | |
block_sync::sync(); | |
if (read_write_pred && is_warp_head) { | |
shared_mem[smem_offset + warp_idx] = reduce_val; | |
} | |
block_sync::sync(); | |
if (warp_idx == 0) { | |
// This assumes num_of_warps will be < 32, meaning < 1024 blocks. | |
// Should be true for long enough. | |
assert(num_of_warps <= 32); | |
reduce_val = lane_idx < num_of_warps ? shared_mem[smem_offset + lane_idx] | |
: init_val; | |
// Reduce within warp 0 | |
for (int i = 16; i >= 1; i /= 2) { | |
reduction_op( | |
reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, 32)); | |
} | |
} | |
if (is_warp_head) { | |
reduction_op(out, reduce_val); | |
} | |
} else { | |
reduction_op(out, reduce_val); | |
} | |
} | |
} // namespace warp | |
// No "#pragma once" because this is a raw definition that can be copied by jit codegen. | |
// Eager mode clients should not include this file directly, instead, | |
// they should #include <ATen/cuda/CUDAGeneratorImpl.h>, which has a #pragma once. | |
// Stores RNG state values. Passed as a kernel argument. | |
// See Note [CUDA Graph-safe RNG states]. | |
// | |
// The raw definition lives in its own file so jit codegen can easily copy it. | |
namespace at { | |
struct PhiloxCudaState { | |
PhiloxCudaState() = default; | |
// Called if graph capture is not underway | |
PhiloxCudaState(uint64_t seed, | |
uint64_t offset) { | |
seed_ = seed; | |
offset_.val = offset; | |
} | |
// Called if graph capture is underway | |
PhiloxCudaState(uint64_t seed, | |
int64_t* offset_extragraph, | |
uint32_t offset_intragraph) { | |
seed_ = seed; | |
offset_.ptr = offset_extragraph; | |
offset_intragraph_ = offset_intragraph; | |
captured_ = true; | |
} | |
// Public members, directly accessible by at::cuda::philox::unpack. | |
// If we made them private with getters/setters, the getters/setters | |
// would have to be __device__, and we can't declare __device__ in ATen. | |
union Payload { | |
uint64_t val; | |
int64_t* ptr; | |
}; | |
uint64_t seed_ = 0; | |
Payload offset_; | |
uint32_t offset_intragraph_ = 0; | |
bool captured_ = false; | |
}; | |
} // namespace at | |
__global__ void kernel212(Tensor<bool, 0> T0, Tensor<int, 0> T1, Tensor<int, 0> T2, Tensor<int, 0> T3) { | |
T3[0] | |
= where(T0[0], T1[0], T2[0]); | |
} | |
} | |
CUDA NVRTC compile error: default_program(1670): error: more than one instance of overloaded function "CudaCodeGen::where" matches the argument list: | |
function "CudaCodeGen::where(__nv_bool, double, double)" | |
function "CudaCodeGen::where(__nv_bool, float, float)" | |
function "CudaCodeGen::where(__nv_bool, CudaCodeGen::int64_t, CudaCodeGen::int64_t)" | |
argument types are: (__nv_bool, int, int) | |
1 error detected in the compilation of "default_program". | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_softmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":15, please report a bug to PyTorch. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_softmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":15, please report a bug to PyTorch. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_softmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":15, please report a bug to PyTorch. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_softmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":15, please report a bug to PyTorch. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_softmin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":15, please report a bug to PyTorch. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_softmin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":15, please report a bug to PyTorch. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_softmin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":15, please report a bug to PyTorch. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_softmin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":15, please report a bug to PyTorch. | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_sum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_sum_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: CUDA driver error: invalid resource handle | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_sum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_sum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_sum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_sum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: CUDA driver error: invalid resource handle | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_var_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_var_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Expected kernel_ to be true, but got false. (Could this error message be improved? If so, please report an enhancement request to PyTorch.) | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_var_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Expected kernel_ to be true, but got false. (Could this error message be improved? If so, please report an enhancement request to PyTorch.) | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_var_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_var_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_var_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_var_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_var_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_var_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_var_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: graph_cache_.count(kernel_id) > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/manager.cpp":108, please report a bug to PyTorch. graph cache miss at run time | |
====================================================================== | |
ERROR: test_nvfuser_correctness__masked_var_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: graph_cache_.count(kernel_id) > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/manager.cpp":108, please report a bug to PyTorch. graph cache miss at run time | |
====================================================================== | |
ERROR: test_nvfuser_correctness_acos_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_acos_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_acos_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_acos_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_acos_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_acos_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_acos_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_add_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_add_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_add_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_add_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_add_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_add_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_add_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_addcmul_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Illegal Cast value from DataType: int64_t to DataType: int | |
====================================================================== | |
ERROR: test_nvfuser_correctness_amax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness_amax_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Could not generate a max op for tensor with type: bool | |
====================================================================== | |
ERROR: test_nvfuser_correctness_amax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness_amax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness_amax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness_amax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Could not generate a max op for tensor with type: int | |
====================================================================== | |
ERROR: test_nvfuser_correctness_amax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Tried to reduce a 0-dim tensor | |
====================================================================== | |
ERROR: test_nvfuser_correctness_bitwise_not_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_bitwise_not_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_bitwise_not_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Illegal Cast value from DataType: float to DataType: bool | |
====================================================================== | |
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: sorted_exprs.size() > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp":1403, please report a bug to PyTorch. Error during expression sorting, no expressions produced. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Illegal Cast value from DataType: float to DataType: bool | |
====================================================================== | |
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Illegal Cast value from DataType: float to DataType: bool | |
====================================================================== | |
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Illegal Cast value from DataType: double to DataType: bool | |
====================================================================== | |
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Illegal Cast value from DataType: int to DataType: bool | |
====================================================================== | |
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Illegal Cast value from DataType: int64_t to DataType: bool | |
====================================================================== | |
ERROR: test_nvfuser_correctness_bool_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Illegal Cast value from DataType: float to DataType: bool | |
====================================================================== | |
ERROR: test_nvfuser_correctness_bool_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: sorted_exprs.size() > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp":1403, please report a bug to PyTorch. Error during expression sorting, no expressions produced. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_bool_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Illegal Cast value from DataType: float to DataType: bool | |
====================================================================== | |
ERROR: test_nvfuser_correctness_bool_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Illegal Cast value from DataType: float to DataType: bool | |
====================================================================== | |
ERROR: test_nvfuser_correctness_bool_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Illegal Cast value from DataType: double to DataType: bool | |
====================================================================== | |
ERROR: test_nvfuser_correctness_bool_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Illegal Cast value from DataType: int to DataType: bool | |
====================================================================== | |
ERROR: test_nvfuser_correctness_bool_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Illegal Cast value from DataType: int64_t to DataType: bool | |
====================================================================== | |
ERROR: test_nvfuser_correctness_clamp_scalar_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_clamp_scalar_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_clamp_scalar_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_clamp_scalar_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: The following operation failed in the TorchScript interpreter. | |
Traceback of TorchScript (most recent call last): | |
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported. | |
====================================================================== | |
ERROR: test_nvfuser_correctness_clamp_scalar_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for | |
dbs = parent.get_debug_state() | |
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper | |
method(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test | |
raise rte | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test | |
result = test(self, **param_kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper | |
return test(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness | |
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn | |
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined] | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for | |
return _script_method_graph_for(self, self, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for | |
self(*args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call | |
return prof_callable(func_call, *args, **kwargs) | |
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable | |
return callable(*args, **kwargs) | |
RuntimeError: Illegal Cast value from DataType: int64_t to DataType: int | |
==== |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment