davidberard98/nvfuser-opinfo-rebased.txt

## nvfuser-opinfo-rebased.txt
srun: job 21221 queued and waiting for resources
srun: job 21221 has been allocated resources
srun: error: ioctl(TIOCGWINSZ): Inappropriate ioctl for device
srun: error: Not using a pseudo-terminal, disregarding --pty option
monkeytype is not installed. Skipping tests for Profile-Directed Typing
test_nvfuser_correctness_H_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___getitem___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___radd___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rand___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rand___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rand___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rand___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rand___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rand___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmatmul___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmatmul___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmatmul___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmatmul___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmatmul___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmatmul___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___ror___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___ror___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___ror___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___ror___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___ror___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___ror___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rpow___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/_tensor.py:627: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  return torch.tensor(other, dtype=dtype, device=self.device) ** self
ERROR
test_nvfuser_correctness___rpow___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rsub___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rsub___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rsub___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rsub___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rsub___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rsub___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rsub___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rsub___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rsub___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rsub___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rsub___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rxor___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rxor___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rxor___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rxor___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rxor___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rxor___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness__masked_amax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amax_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/_masked/__init__.py:333: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  return torch.tensor(torch.iinfo(dtype).min, dtype=dtype, device=device)
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amax_cuda_int16! Caching allocator allocated memory was 467456 and is now reported as 495616 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_amax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amax_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amax_cuda_int8! Caching allocator allocated memory was 495616 and is now reported as 523776 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_amax_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amax_cuda_uint8! Caching allocator allocated memory was 523776 and is now reported as 551936 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_amin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/_masked/__init__.py:336: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  return torch.tensor(torch.inf, dtype=dtype, device=device)
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amin_cuda_bfloat16! Caching allocator allocated memory was 551936 and is now reported as 580096 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_amin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amin_cuda_float16! Caching allocator allocated memory was 580096 and is now reported as 608256 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_amin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amin_cuda_float32! Caching allocator allocated memory was 608256 and is now reported as 636416 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_amin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amin_cuda_float64! Caching allocator allocated memory was 636416 and is now reported as 664576 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_amin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/_masked/__init__.py:338: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  return torch.tensor(torch.iinfo(dtype).max, dtype=dtype, device=device)
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amin_cuda_int16! Caching allocator allocated memory was 664576 and is now reported as 692736 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_amin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_log_softmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/_masked/__init__.py:331: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  return torch.tensor(-torch.inf, dtype=dtype, device=device)
/fsx/users/dberard/pytorch/torch/_masked/__init__.py:386: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  elif mask.shape != input.shape:
ERROR
test_nvfuser_correctness__masked_log_softmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_log_softmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_log_softmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_mean_cuda_complex64! Caching allocator allocated memory was 888832 and is now reported as 956928 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_mean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_mean_cuda_int16! Caching allocator allocated memory was 956928 and is now reported as 1025024 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_mean_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_mean_cuda_int8! Caching allocator allocated memory was 1037312 and is now reported as 1105408 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_mean_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_mean_cuda_uint8! Caching allocator allocated memory was 1105408 and is now reported as 1173504 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/_masked/__init__.py:351: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  return torch.tensor(0, dtype=dtype, device=device)
/fsx/users/dberard/pytorch/torch/_masked/__init__.py:350: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  return torch.tensor(torch.inf, dtype=dtype, device=device)
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_norm_cuda_bfloat16! Caching allocator allocated memory was 1173504 and is now reported as 1314304 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_norm_cuda_float16! Caching allocator allocated memory was 1314304 and is now reported as 1455104 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_norm_cuda_float32! Caching allocator allocated memory was 1455104 and is now reported as 1595904 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_norm_cuda_float64! Caching allocator allocated memory was 1595904 and is now reported as 1736704 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_normalize_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness__masked_normalize_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness__masked_normalize_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness__masked_normalize_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness__masked_prod_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_bfloat16! Caching allocator allocated memory was 1736704 and is now reported as 1764864 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_prod_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_prod_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_complex128! Caching allocator allocated memory was 1765376 and is now reported as 1793536 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_prod_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_complex64! Caching allocator allocated memory was 1793536 and is now reported as 1821696 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_prod_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_float16! Caching allocator allocated memory was 1821696 and is now reported as 1849856 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_prod_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_float32! Caching allocator allocated memory was 1849856 and is now reported as 1878016 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_prod_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_float64! Caching allocator allocated memory was 1878016 and is now reported as 1906176 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_prod_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_int16! Caching allocator allocated memory was 1906176 and is now reported as 1934336 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_prod_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_prod_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_int64! Caching allocator allocated memory was 1934848 and is now reported as 1963008 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_prod_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_int8! Caching allocator allocated memory was 1963008 and is now reported as 1991168 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_prod_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_uint8! Caching allocator allocated memory was 1991168 and is now reported as 2019328 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_softmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_softmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_softmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_softmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_softmin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_softmin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_softmin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_softmin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_sum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_sum_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_sum_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_sum_cuda_complex128! Caching allocator allocated memory was 2101760 and is now reported as 2129920 on device 0. CUDA driver allocated memory was 1721761792 and is now 1721761792.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_sum_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_sum_cuda_complex64! Caching allocator allocated memory was 2129920 and is now reported as 2158080 on device 0. CUDA driver allocated memory was 1721761792 and is now 1721761792.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_sum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_sum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_sum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_sum_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_sum_cuda_int16! Caching allocator allocated memory was 2158080 and is now reported as 2186240 on device 0. CUDA driver allocated memory was 1721761792 and is now 1721761792.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_sum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_sum_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_sum_cuda_int64! Caching allocator allocated memory was 2186752 and is now reported as 2214912 on device 0. CUDA driver allocated memory was 1721761792 and is now 1721761792.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_sum_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_sum_cuda_int8! Caching allocator allocated memory was 2214912 and is now reported as 2243072 on device 0. CUDA driver allocated memory was 1721761792 and is now 1721761792.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_sum_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_sum_cuda_uint8! Caching allocator allocated memory was 2243072 and is now reported as 2271232 on device 0. CUDA driver allocated memory was 1721761792 and is now 1721761792.
  warnings.warn(msg)
ok
test_nvfuser_correctness__masked_var_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_abs_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acos_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_acos_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_acos_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acos_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acos_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_acos_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_acos_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_acos_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acos_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_acos_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_acos_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acos_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_add_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_add_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_add_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_add_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_add_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_add_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_add_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_add_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_add_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_add_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_add_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_add_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addbmm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addbmm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addbmm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addbmm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addbmm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addbmm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcdiv_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcdiv_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcdiv_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcdiv_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcdiv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcdiv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_decomposed_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_decomposed_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_decomposed_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_decomposed_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_decomposed_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_decomposed_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmv_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmv_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmv_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmv_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_allclose_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_allclose_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_allclose_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_allclose_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_allclose_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_allclose_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_amax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_amax_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_amax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_amax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_amax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_amax_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_amax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_amax_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amax_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmax_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmax_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmax_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argsort_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argsort_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argsort_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argsort_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argsort_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argsort_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argsort_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argsort_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argsort_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_baddbmm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_baddbmm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_baddbmm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_baddbmm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_baddbmm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_baddbmm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bernoulli_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bernoulli_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bernoulli_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bernoulli_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py:3737: UserWarning: Casting complex values to real discards the imaginary part (Triggered internally at  ../aten/src/ATen/native/Copy.cpp:239.)
  ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bincount_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_bincount_cuda_int16! Caching allocator allocated memory was 2289664 and is now reported as 2290688 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
  warnings.warn(msg)
ok
test_nvfuser_correctness_bincount_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_bincount_cuda_int32! Caching allocator allocated memory was 2290688 and is now reported as 2291712 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
  warnings.warn(msg)
ok
test_nvfuser_correctness_bincount_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_bincount_cuda_int64! Caching allocator allocated memory was 2291712 and is now reported as 2292736 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
  warnings.warn(msg)
ok
test_nvfuser_correctness_bincount_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_bincount_cuda_int8! Caching allocator allocated memory was 2292736 and is now reported as 2293760 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
  warnings.warn(msg)
ok
test_nvfuser_correctness_bincount_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_bincount_cuda_uint8! Caching allocator allocated memory was 2293760 and is now reported as 2294784 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
  warnings.warn(msg)
ok
test_nvfuser_correctness_bitwise_and_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_and_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_and_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_and_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_and_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_and_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_left_shift_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_left_shift_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_left_shift_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_left_shift_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_left_shift_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_left_shift_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_left_shift_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_left_shift_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_left_shift_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_not_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bitwise_not_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_not_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bitwise_not_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bitwise_not_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_not_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_or_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_or_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_or_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_or_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_or_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_or_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_right_shift_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_right_shift_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_right_shift_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_right_shift_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_right_shift_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_right_shift_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_right_shift_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_right_shift_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_right_shift_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_xor_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_xor_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_xor_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_xor_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_xor_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_xor_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bmm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bmm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bmm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bmm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bmm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bmm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bucketize_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bucketize_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bucketize_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bucketize_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bucketize_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bucketize_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bucketize_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bucketize_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cdist_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cdist_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ceil_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ceil_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ceil_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ceil_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cholesky_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_inverse_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_inverse_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_inverse_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_inverse_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_solve_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_solve_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_solve_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_solve_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_chunk_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_scalar_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_clamp_scalar_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_clamp_scalar_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_clamp_scalar_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_clamp_scalar_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_scalar_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_clamp_scalar_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_clamp_scalar_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_scalar_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_column_stack_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_column_stack_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_column_stack_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_column_stack_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_column_stack_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_column_stack_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py:3737: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at  ../aten/src/ATen/native/TensorShape.cpp:2246.)
  ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
ok
test_nvfuser_correctness_combinations_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_complex_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_complex_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cos_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cos_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cos_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cos_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cos_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cos_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cos_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cos_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cos_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cos_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cos_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cos_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cosh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cosh_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cosh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cosh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cosh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cosh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cosh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cosh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cosh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cosh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cosh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cosh_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cov_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_bfloat16! Caching allocator allocated memory was 2294784 and is now reported as 2298880 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
  warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_complex128! Caching allocator allocated memory was 2298880 and is now reported as 2302976 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
  warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_complex64! Caching allocator allocated memory was 2302976 and is now reported as 2307072 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
  warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_float16! Caching allocator allocated memory was 2307072 and is now reported as 2311168 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
  warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_float32! Caching allocator allocated memory was 2311168 and is now reported as 2315264 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
  warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_float64! Caching allocator allocated memory was 2315264 and is now reported as 2319360 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
  warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_int16! Caching allocator allocated memory was 2319360 and is now reported as 2323456 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
  warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_int32! Caching allocator allocated memory was 2323456 and is now reported as 2327552 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
  warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_int64! Caching allocator allocated memory was 2327552 and is now reported as 2331648 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
  warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_int8! Caching allocator allocated memory was 2331648 and is now reported as 2335744 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
  warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_uint8! Caching allocator allocated memory was 2335744 and is now reported as 2339840 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
  warnings.warn(msg)
ok
test_nvfuser_correctness_cross_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cross_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cross_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cross_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cross_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cross_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cross_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cross_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cross_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cross_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_digamma_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py:3737: UserWarning: Specified kernel cache directory could not be created! This disables kernel caching. Specified directory is /data/home/dberard/.cache/torch/kernels. This warning will appear only once per process. (Triggered internally at  ../aten/src/ATen/native/cuda/jit_utils.cpp:844.)
  ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
ok
test_nvfuser_correctness_digamma_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_digamma_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_digamma_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_digamma_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_digamma_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_digamma_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_digamma_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_digamma_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dist_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dist_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dist_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dist_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dist_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dist_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_floor_rounding_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_floor_rounding_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_floor_rounding_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_floor_rounding_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_floor_rounding_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_floor_rounding_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_floor_rounding_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_floor_rounding_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_floor_rounding_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_floor_rounding_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_floor_rounding_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_floor_rounding_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_no_rounding_mode_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_no_rounding_mode_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_no_rounding_mode_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_no_rounding_mode_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_no_rounding_mode_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_no_rounding_mode_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_no_rounding_mode_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_no_rounding_mode_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_no_rounding_mode_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_no_rounding_mode_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_no_rounding_mode_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_no_rounding_mode_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_trunc_rounding_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_trunc_rounding_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_trunc_rounding_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_trunc_rounding_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_trunc_rounding_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_trunc_rounding_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_trunc_rounding_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_trunc_rounding_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_trunc_rounding_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_trunc_rounding_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_trunc_rounding_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_trunc_rounding_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dot_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dot_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dot_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dot_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dot_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dot_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_eig_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_eig_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_eig_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_eig_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_einsum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_einsum_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_einsum_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_einsum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_einsum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_einsum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_empty_like_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_eq_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_eq_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_eq_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_eq_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_eq_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_eq_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_eq_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_eq_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_eq_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_eq_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_eq_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_eq_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfc_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_erfc_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_erfc_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_erfc_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_erfc_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_erfc_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfc_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_erfc_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_erfc_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfc_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfinv_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfinv_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfinv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfinv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfinv_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfinv_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfinv_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfinv_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfinv_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_exp_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_exp_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_exp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_exp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_exp_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_exp_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_exp_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfftn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfftn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfftn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfftn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfftn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfftn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfftn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfftn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfftn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfftn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfftn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfftn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfftn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfftn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfftn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfftn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_divide_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py:3737: UserWarning: floor_divide is deprecated, and will be removed in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values.
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ../aten/src/ATen/native/BinaryOps.cpp:607.)
  ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
ok
test_nvfuser_correctness_floor_divide_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_divide_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_divide_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_divide_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_divide_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_divide_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_divide_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_divide_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_autodiffed_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_autodiffed_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_autodiffed_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_autodiffed_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_autodiffed_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_autodiffed_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_autodiffed_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_autodiffed_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_autodiffed_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_frac_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_frac_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_frac_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_frac_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_frexp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_frexp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_frexp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gcd_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gcd_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gcd_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gcd_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gcd_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ge_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ge_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ge_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ge_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ge_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ge_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ge_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ge_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ge_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ge_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_geqrf_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_geqrf_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_geqrf_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_geqrf_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_gradient_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gradient_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gradient_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gradient_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gradient_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gradient_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gradient_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gradient_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gradient_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gradient_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gt_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_gt_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_gt_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_gt_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_gt_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_gt_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gt_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_gt_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_gt_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gt_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_histc_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_histc_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_histc_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_histc_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_histc_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_histc_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hypot_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hypot_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hypot_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hypot_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_igamma_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_igamma_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_igamma_grad_other_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_igamma_grad_other_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_igammac_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_igammac_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_igammac_grad_other_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_igammac_grad_other_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_imag_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_imag_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inner_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inner_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inner_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inner_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inner_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inner_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inverse_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inverse_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inverse_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inverse_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_istft_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_istft_cuda_complex128! Caching allocator allocated memory was 2339840 and is now reported as 2340352 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_istft_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_istft_cuda_complex64! Caching allocator allocated memory was 2340352 and is now reported as 2340864 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_istft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/functional.py:770: UserWarning: istft will require a complex-valued input tensor in a future PyTorch release. Matching the output from stft with return_complex=True.  (Triggered internally at  ../aten/src/ATen/native/SpectralOps.cpp:950.)
  return _VF.istft(input, n_fft, hop_length, win_length, window, center,  # type: ignore[attr-defined]
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_istft_cuda_float32! Caching allocator allocated memory was 2340864 and is now reported as 2341376 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_istft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_istft_cuda_float64! Caching allocator allocated memory was 2341376 and is now reported as 2341888 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_kron_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kthvalue_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kthvalue_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kthvalue_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kthvalue_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kthvalue_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kthvalue_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kthvalue_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kthvalue_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lcm_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lcm_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lcm_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lcm_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lcm_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_le_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_le_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_le_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_le_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_le_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_le_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_le_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_le_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_le_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_le_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lerp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lerp_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lerp_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lerp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lerp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lerp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lgamma_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lgamma_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lgamma_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lgamma_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lgamma_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lgamma_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lgamma_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lgamma_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lgamma_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cholesky_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cholesky_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cholesky_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cholesky_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cholesky_ex_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cholesky_ex_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cholesky_ex_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cholesky_ex_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cond_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cond_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cond_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cond_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_det_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_det_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_det_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_det_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_det_singular_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_det_singular_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eig_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eig_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eig_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eig_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigvals_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigvals_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigvals_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigvals_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigvalsh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigvalsh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigvalsh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigvalsh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_householder_product_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_householder_product_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_householder_product_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_householder_product_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_inv_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_inv_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_inv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_inv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_inv_ex_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_inv_ex_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_inv_ex_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_inv_ex_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_lstsq_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_lstsq_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_lstsq_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_lstsq_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_lstsq_grad_oriented_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_lstsq_grad_oriented_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_lstsq_grad_oriented_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_lstsq_grad_oriented_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_lu_factor_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_lu_factor_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_lu_factor_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_lu_factor_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_lu_factor_ex_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_lu_factor_ex_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_lu_factor_ex_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_lu_factor_ex_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_norm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_norm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_power_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_power_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_power_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_power_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_rank_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_rank_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_rank_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_rank_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_rank_hermitian_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_rank_hermitian_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_rank_hermitian_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_rank_hermitian_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_multi_dot_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_multi_dot_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_multi_dot_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_multi_dot_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_multi_dot_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_multi_dot_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_norm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_norm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_pinv_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_linalg_pinv_cuda_complex128! Caching allocator allocated memory was 2341888 and is now reported as 2345984 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_linalg_pinv_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_linalg_pinv_cuda_complex64! Caching allocator allocated memory was 2345984 and is now reported as 2350080 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_linalg_pinv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_linalg_pinv_cuda_float32! Caching allocator allocated memory was 2350080 and is now reported as 2354176 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_linalg_pinv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_linalg_pinv_cuda_float64! Caching allocator allocated memory was 2354176 and is now reported as 2358272 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_linalg_pinv_hermitian_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_pinv_hermitian_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_pinv_hermitian_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_pinv_hermitian_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_pinv_singular_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test'
test_nvfuser_correctness_linalg_pinv_singular_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test'
test_nvfuser_correctness_linalg_pinv_singular_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test'
test_nvfuser_correctness_linalg_pinv_singular_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test'
test_nvfuser_correctness_linalg_qr_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_qr_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_qr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_qr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_slogdet_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_slogdet_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_slogdet_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_slogdet_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_solve_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_solve_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_solve_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_solve_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_solve_triangular_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_solve_triangular_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_solve_triangular_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_solve_triangular_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_svd_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_svd_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_svd_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_svd_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_svdvals_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_svdvals_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_svdvals_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_svdvals_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_tensorinv_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_tensorinv_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_tensorinv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_tensorinv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_tensorsolve_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_linalg_tensorsolve_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_linalg_tensorsolve_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_linalg_tensorsolve_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_linalg_vector_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_vector_norm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_vector_norm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_vector_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_vector_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_vector_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_log10_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log10_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log10_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log10_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log10_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log10_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log10_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log10_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log10_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log10_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log10_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log10_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log2_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log2_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log2_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log2_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logaddexp2_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logaddexp2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logaddexp2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logaddexp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logaddexp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logaddexp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logcumsumexp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logcumsumexp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logcumsumexp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logcumsumexp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logdet_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_logdet_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_logical_and_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lt_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lt_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lt_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lt_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lt_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lt_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lt_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lt_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lt_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lt_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lu_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lu_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lu_solve_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_lu_solve_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_lu_solve_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_lu_solve_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_lu_unpack_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lu_unpack_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lu_unpack_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lu_unpack_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matmul_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matmul_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matmul_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matmul_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matmul_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matmul_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matrix_exp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matrix_exp_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matrix_exp_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matrix_exp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matrix_exp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matrix_exp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_binary_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_binary_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_max_binary_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_binary_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_binary_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_binary_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_binary_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_binary_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_binary_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_binary_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mean_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mean_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_median_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_median_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_median_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_median_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_median_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_median_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_median_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_median_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_binary_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_binary_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_min_binary_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_binary_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_binary_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_binary_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_binary_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_binary_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_binary_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_binary_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_msort_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_msort_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_msort_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_msort_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_msort_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_msort_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_msort_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_msort_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_msort_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mul_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mul_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mul_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mul_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mul_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mul_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mul_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mul_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mul_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mul_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mul_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mul_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_multinomial_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_multinomial_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_multinomial_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mv_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mv_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mv_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mv_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmedian_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmedian_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmedian_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmedian_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmedian_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmedian_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmedian_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmedian_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanquantile_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanquantile_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ne_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ne_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ne_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ne_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ne_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ne_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ne_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ne_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ne_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ne_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ne_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ne_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_empty_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_full_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nextafter_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nextafter_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nextafter_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool3d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool3d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool3d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_batch_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:2363: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if size_prods == 1:
ERROR
test_nvfuser_correctness_nn_functional_batch_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_batch_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_batch_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_batch_norm_without_cudnn_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_batch_norm_without_cudnn_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_batch_norm_without_cudnn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_batch_norm_without_cudnn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_bilinear_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_bilinear_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_bilinear_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_bilinear_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_celu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_celu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_celu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_celu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py:3737: UserWarning: Using padding='same' with even kernel lengths and odd dilation may require a zero-padded copy of the input be created (Triggered internally at  ../aten/src/ATen/native/Convolution.cpp:744.)
  ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
ok
test_nvfuser_correctness_nn_functional_conv2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose3d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... FAIL
test_nvfuser_correctness_nn_functional_conv_transpose3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_similarity_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_similarity_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_similarity_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_similarity_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cross_entropy_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_cross_entropy_cuda_bfloat16! Caching allocator allocated memory was 2378752 and is now reported as 2380288 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560.
  warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_cross_entropy_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_cross_entropy_cuda_float16! Caching allocator allocated memory was 2380288 and is now reported as 2381824 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560.
  warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_cross_entropy_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_cross_entropy_cuda_float32! Caching allocator allocated memory was 2381824 and is now reported as 2383360 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560.
  warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_cross_entropy_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_cross_entropy_cuda_float64! Caching allocator allocated memory was 2383360 and is now reported as 2384896 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560.
  warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_ctc_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_ctc_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_dropout2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_dropout2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_dropout2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_dropout2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_dropout_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_dropout_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_dropout_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_dropout_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_elu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_elu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_elu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_elu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_embedding_bag_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_embedding_bag_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_embedding_bag_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_embedding_bag_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_embedding_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_embedding_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_embedding_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_embedding_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_with_train_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_with_train_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_with_train_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_with_train_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_fractional_max_pool2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_fractional_max_pool2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_fractional_max_pool2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_fractional_max_pool3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_fractional_max_pool3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_fractional_max_pool3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:2756: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if var.size() != input.size():
/fsx/users/dberard/pytorch/torch/nn/functional.py:2780: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if torch.any(var < 0):
/fsx/users/dberard/pytorch/torch/nn/functional.py:2768: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  elif input.size()[:-1] == var.size()[:-1] and var.size(-1) == 1:  # Heteroscedastic case
/fsx/users/dberard/pytorch/torch/nn/functional.py:2762: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if input.size()[:-1] == var.size():
ok
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_gelu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_gelu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_gelu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_gelu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_glu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_glu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_glu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_glu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_grid_sample_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_grid_sample_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_grid_sample_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_group_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:2475: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
  _verify_batch_size([input.size(0) * input.size(1) // num_groups, num_groups] + list(input.size()[2:]))
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_group_norm_cuda_bfloat16! Caching allocator allocated memory was 2384896 and is now reported as 2388992 on device 0. CUDA driver allocated memory was 2686451712 and is now 2686451712.
  warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_group_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_group_norm_cuda_float16! Caching allocator allocated memory was 2388992 and is now reported as 2393088 on device 0. CUDA driver allocated memory was 2686451712 and is now 2686451712.
  warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_group_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_group_norm_cuda_float32! Caching allocator allocated memory was 2393088 and is now reported as 2397184 on device 0. CUDA driver allocated memory was 2686451712 and is now 2686451712.
  warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_group_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_group_norm_cuda_float64! Caching allocator allocated memory was 2397184 and is now reported as 2401280 on device 0. CUDA driver allocated memory was 2686451712 and is now 2686451712.
  warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_hardshrink_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardshrink_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardshrink_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardshrink_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardsigmoid_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardsigmoid_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardsigmoid_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardsigmoid_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardswish_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardswish_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardswish_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardswish_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardtanh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardtanh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardtanh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardtanh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardtanh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardtanh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardtanh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardtanh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hinge_embedding_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hinge_embedding_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hinge_embedding_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hinge_embedding_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_huber_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3170: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if not (target.size() == input.size()):
ok
test_nvfuser_correctness_nn_functional_huber_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_huber_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_huber_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_instance_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:2408: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if size_prods == 1:
ERROR
test_nvfuser_correctness_nn_functional_instance_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_instance_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_instance_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_instance_norm_cuda_float64! Caching allocator allocated memory was 2420224 and is now reported as 2435072 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
  warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_interpolate_area_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3826: UserWarning: The default behavior for interpolate/upsample with float scale_factor changed in 1.6.0 to align with other frameworks/libraries, and now uses scale_factor directly, instead of relying on the computed output size. If you wish to restore the old behavior, please set recompute_scale_factor=True. See the documentation of nn.Upsample for details.
  warnings.warn(
/fsx/users/dberard/pytorch/torch/nn/functional.py:3848: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  (torch.floor((input.size(i + 2).float() * torch.tensor(scale_factors[i], dtype=torch.float32)).float()))
ok
test_nvfuser_correctness_nn_functional_interpolate_area_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_area_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_area_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_bicubic_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3771: UserWarning: Default upsampling behavior when mode=bicubic is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details.
  warnings.warn(
ok
test_nvfuser_correctness_nn_functional_interpolate_bicubic_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_bicubic_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_bilinear_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3771: UserWarning: Default upsampling behavior when mode=bilinear is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details.
  warnings.warn(
ok
test_nvfuser_correctness_nn_functional_interpolate_bilinear_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_bilinear_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_linear_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3771: UserWarning: Default upsampling behavior when mode=linear is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details.
  warnings.warn(
ok
test_nvfuser_correctness_nn_functional_interpolate_linear_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_linear_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_nearest_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_nearest_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_nearest_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_nearest_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_trilinear_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3771: UserWarning: Default upsampling behavior when mode=trilinear is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details.
  warnings.warn(
ok
test_nvfuser_correctness_nn_functional_interpolate_trilinear_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_trilinear_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_kl_div_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:2863: UserWarning: reduction: 'mean' divides the total loss by both the batch size and the support size.'batchmean' divides only by the batch size, and aligns with the KL div math definition.'mean' will be changed to behave the same as 'batchmean' in the next major release.
  warnings.warn(
ok
test_nvfuser_correctness_nn_functional_kl_div_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_kl_div_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_kl_div_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_kl_div_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_kl_div_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_kl_div_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_kl_div_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_layer_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_layer_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_layer_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_layer_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_leaky_relu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_leaky_relu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_leaky_relu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_leaky_relu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_linear_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_linear_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_linear_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_linear_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_linear_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_linear_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_local_response_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_local_response_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_local_response_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_local_response_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_logsigmoid_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_logsigmoid_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_logsigmoid_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_max_pool1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:682: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool1d in a future release.
  warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change"
/fsx/users/dberard/pytorch/torch/nn/functional.py:651: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool1d in a future release.
  warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change"
ok
test_nvfuser_correctness_nn_functional_max_pool1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_max_pool1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_max_pool1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_max_pool2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:780: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool2d in a future release.
  warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change"
/fsx/users/dberard/pytorch/torch/nn/functional.py:749: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool2d in a future release.
  warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change"
ok
test_nvfuser_correctness_nn_functional_max_pool2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_max_pool2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_max_pool2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_max_pool3d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:878: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool3d in a future release.
  warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change"
/fsx/users/dberard/pytorch/torch/nn/functional.py:847: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool3d in a future release.
  warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change"
ok
test_nvfuser_correctness_nn_functional_max_pool3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_max_pool3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_max_pool3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_mish_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_mish_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_mish_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_mish_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_mse_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3228: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if not (target.size() == input.size()):
ok
test_nvfuser_correctness_nn_functional_mse_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_mse_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_mse_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_nll_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_nll_loss_cuda_bfloat16! Caching allocator allocated memory was 2441216 and is now reported as 2455040 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
  warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_nll_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_nll_loss_cuda_float16! Caching allocator allocated memory was 2455040 and is now reported as 2468864 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
  warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_nll_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_nll_loss_cuda_float32! Caching allocator allocated memory was 2468864 and is now reported as 2482688 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
  warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_nll_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_nll_loss_cuda_float64! Caching allocator allocated memory was 2482688 and is now reported as 2496512 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
  warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_normalize_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_normalize_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_normalize_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_normalize_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_normalize_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_normalize_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_one_hot_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:4746: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  assert padding[-(idx * 2 + 1)] <= size, "Padding value causes wrapping around more than once."
/fsx/users/dberard/pytorch/torch/nn/functional.py:4747: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  assert padding[-(idx * 2 + 2)] <= size, "Padding value causes wrapping around more than once."
/fsx/users/dberard/pytorch/torch/nn/functional.py:4749: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  assert (
ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_reflect_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_reflect_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_reflect_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_reflect_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_reflect_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_replicate_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_replicate_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_replicate_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_replicate_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_replicate_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_prelu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_prelu_cuda_float16! Caching allocator allocated memory was 2496512 and is now reported as 2505728 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
  warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_prelu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_prelu_cuda_float32! Caching allocator allocated memory was 2505728 and is now reported as 2514944 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
  warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_prelu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_prelu_cuda_float64! Caching allocator allocated memory was 2514944 and is now reported as 2524160 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
  warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_relu6_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu6_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu6_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu6_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu6_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu6_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu6_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu6_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu6_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_rrelu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_rrelu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_rrelu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_rrelu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_selu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_selu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_selu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_selu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_silu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_silu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_silu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_silu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softplus_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softplus_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softplus_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softplus_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softshrink_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softshrink_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softshrink_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softshrink_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softsign_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softsign_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softsign_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softsign_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softsign_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softsign_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softsign_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softsign_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softsign_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softsign_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softsign_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softsign_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_threshold_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_threshold_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_threshold_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_threshold_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_threshold_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_threshold_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_threshold_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_threshold_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_threshold_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_unfold_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_unfold_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_unfold_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_unfold_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_unfold_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_upsample_bilinear_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:4008: UserWarning: nn.functional.upsample_bilinear is deprecated. Use nn.functional.interpolate instead.
  warnings.warn("nn.functional.upsample_bilinear is deprecated. Use nn.functional.interpolate instead.")
ok
test_nvfuser_correctness_nn_functional_upsample_bilinear_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_upsample_bilinear_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_upsample_nearest_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3953: UserWarning: nn.functional.upsample_nearest is deprecated. Use nn.functional.interpolate instead.
  warnings.warn("nn.functional.upsample_nearest is deprecated. Use nn.functional.interpolate instead.")
ok
test_nvfuser_correctness_nn_functional_upsample_nearest_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_upsample_nearest_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_upsample_nearest_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_fro_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_fro_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_fro_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_fro_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_fro_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_fro_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_inf_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_inf_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_inf_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_inf_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_inf_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_inf_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_nuc_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_nuc_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_nuc_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_nuc_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_normal_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_normal_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_normal_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_normal_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_normal_number_mean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_normal_number_mean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_normal_number_mean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_normal_number_mean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ormqr_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ormqr_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ormqr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ormqr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pca_lowrank_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pca_lowrank_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pinverse_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pinverse_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pinverse_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pinverse_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polar_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polar_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_pow_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_qr_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_qr_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_qr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_qr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_quantile_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_quantile_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rand_like_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rand_like_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rand_like_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rand_like_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rand_like_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rand_like_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randint_like_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randint_like_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randint_like_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randint_like_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randint_like_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randint_like_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randint_like_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randint_like_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randint_like_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randn_like_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randn_like_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randn_like_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randn_like_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randn_like_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randn_like_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reciprocal_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_reciprocal_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_reciprocal_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reciprocal_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reciprocal_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_reciprocal_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_reciprocal_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_reciprocal_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reciprocal_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_reciprocal_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_reciprocal_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reciprocal_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_renorm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_renorm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_renorm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_renorm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_renorm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_renorm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_interleave_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_bfloat16! Caching allocator allocated memory was 2524160 and is now reported as 2524672 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_bool! Caching allocator allocated memory was 2524672 and is now reported as 2525184 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_complex128! Caching allocator allocated memory was 2525184 and is now reported as 2525696 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_complex64! Caching allocator allocated memory was 2525696 and is now reported as 2526208 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_float16! Caching allocator allocated memory was 2526208 and is now reported as 2526720 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_float32! Caching allocator allocated memory was 2526720 and is now reported as 2527232 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_float64! Caching allocator allocated memory was 2527232 and is now reported as 2527744 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_int16! Caching allocator allocated memory was 2527744 and is now reported as 2528256 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_int32! Caching allocator allocated memory was 2528256 and is now reported as 2528768 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_int64! Caching allocator allocated memory was 2528768 and is now reported as 2529280 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_int8! Caching allocator allocated memory was 2529280 and is now reported as 2529792 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_uint8! Caching allocator allocated memory was 2529792 and is now reported as 2530304 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_reshape_as_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_0_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_0_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_0_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_0_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_3_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_3_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_3_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_3_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_neg_3_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_neg_3_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_neg_3_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_neg_3_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsqrt_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsqrt_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsqrt_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsqrt_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsqrt_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsqrt_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsqrt_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsqrt_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsqrt_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsqrt_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsqrt_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsqrt_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_scalar_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsub_rsub_scalar_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_scalar_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_scalar_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsub_rsub_scalar_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsub_rsub_scalar_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsub_rsub_scalar_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_scalar_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsub_rsub_scalar_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsub_rsub_scalar_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_scalar_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_reduce_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu'
test_nvfuser_correctness_scatter_reduce_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu'
test_nvfuser_correctness_scatter_reduce_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu'
test_nvfuser_correctness_scatter_reduce_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu'
test_nvfuser_correctness_scatter_reduce_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu'
test_nvfuser_correctness_scatter_reduce_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu'
test_nvfuser_correctness_scatter_reduce_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu'
test_nvfuser_correctness_scatter_reduce_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu'
test_nvfuser_correctness_scatter_reduce_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu'
test_nvfuser_correctness_searchsorted_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_float16! Caching allocator allocated memory was 2530304 and is now reported as 2608128 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_searchsorted_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_float32! Caching allocator allocated memory was 2608128 and is now reported as 2685952 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_searchsorted_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_float64! Caching allocator allocated memory was 2685952 and is now reported as 2763776 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_searchsorted_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_int16! Caching allocator allocated memory was 2763776 and is now reported as 2841600 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_searchsorted_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_int32! Caching allocator allocated memory was 2841600 and is now reported as 2919424 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_searchsorted_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_int64! Caching allocator allocated memory was 2919424 and is now reported as 2997248 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_searchsorted_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_int8! Caching allocator allocated memory was 2997248 and is now reported as 3075072 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_searchsorted_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_uint8! Caching allocator allocated memory was 3075072 and is now reported as 3152896 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
  warnings.warn(msg)
ok
test_nvfuser_correctness_select_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sigmoid_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sigmoid_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sigmoid_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sigmoid_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sigmoid_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sigmoid_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sigmoid_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sigmoid_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sigmoid_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sigmoid_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sigmoid_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sigmoid_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_softmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_with_dtype_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_with_dtype_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_with_dtype_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_softmax_with_dtype_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_softmax_with_dtype_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_with_dtype_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_with_dtype_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_with_dtype_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_softmax_with_dtype_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_with_dtype_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_with_dtype_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_softmax_with_dtype_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_solve_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_solve_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_solve_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_solve_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_sort_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sort_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sort_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sort_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sort_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sort_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sort_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sort_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sort_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_erfcx_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_erfcx_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_erfcx_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_erfcx_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_erfcx_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_erfcx_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_erfcx_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_erfcx_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1e_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1e_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1e_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1e_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1e_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1e_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1e_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1e_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtri_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtri_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtri_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtri_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtri_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtri_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtri_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtri_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_grad_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_grad_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_grad_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_grad_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_grad_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_grad_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_grad_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_grad_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_mean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_mean_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_mean_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_mean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_mean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_mean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stft_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_stft_cuda_complex128! Caching allocator allocated memory was 3152896 and is now reported as 3153408 on device 0. CUDA driver allocated memory was 2696937472 and is now 2696937472.
  warnings.warn(msg)
ok
test_nvfuser_correctness_stft_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_stft_cuda_complex64! Caching allocator allocated memory was 3153408 and is now reported as 3153920 on device 0. CUDA driver allocated memory was 2696937472 and is now 2696937472.
  warnings.warn(msg)
ok
test_nvfuser_correctness_stft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/functional.py:695: UserWarning: stft will soon require the return_complex parameter be given for real inputs, and will further require that return_complex=True in a future PyTorch release. (Triggered internally at  ../aten/src/ATen/native/SpectralOps.cpp:798.)
  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]
ERROR
test_nvfuser_correctness_stft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sub_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sub_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sub_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sub_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sub_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sub_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sub_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sub_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sub_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sub_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sub_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sum_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sum_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sum_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sum_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sum_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_to_size_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_to_size_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_to_size_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_to_size_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_to_size_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_to_size_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_svd_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_svd_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_svd_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_svd_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_svd_lowrank_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_svd_lowrank_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_symeig_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_symeig_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_symeig_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_symeig_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_t_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensordot_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensordot_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensordot_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensordot_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensordot_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensordot_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... [W pybind_utils.cpp:39] Warning: Using sparse tensors in TorchScript is experimental. Many optimization pathways have not been thoroughly tested with sparse tensors. Please include the fact that the network is running sparse tensors in any bug reports submitted. (function operator())
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:424: UserWarning: Using sparse tensors in TorchScript is experimental. Many optimization pathways have not been thoroughly tested with sparse tensors. Please include the fact that the network is running sparse tensors in any bug reports submitted. (Triggered internally at  ../torch/csrc/jit/python/pybind_utils.h:691.)
  return callable(*args, **kwargs)
ok
test_nvfuser_correctness_to_sparse_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_topk_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_topk_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_topk_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_topk_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_topk_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_topk_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_topk_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_topk_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_topk_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triangular_solve_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_triangular_solve_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_triangular_solve_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_triangular_solve_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_tril_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_true_divide_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_true_divide_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_true_divide_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_true_divide_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_true_divide_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_true_divide_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_true_divide_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_true_divide_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_true_divide_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_true_divide_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_true_divide_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_true_divide_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trunc_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trunc_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trunc_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trunc_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_consecutive_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_consecutive_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_consecutive_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_consecutive_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_consecutive_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_consecutive_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_consecutive_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_consecutive_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_consecutive_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_mean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_mean_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_mean_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_mean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_mean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_mean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_vdot_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_vdot_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_vdot_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_vdot_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_vdot_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_vdot_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_view_as_complex_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ../c10/util/TypeCast.h:135: fetch_and_cast: block: [0,0,0], thread: [0,0,0] Assertion `false` failed.
../c10/util/TypeCast.h:135: fetch_and_cast: block: [0,0,0], thread: [1,0,0] Assertion `false` failed.
../c10/util/TypeCast.h:135: fetch_and_cast: block: [0,0,0], thread: [2,0,0] Assertion `false` failed.
../c10/util/TypeCast.h:135: fetch_and_cast: block: [0,0,0], thread: [3,0,0] Assertion `false` failed.
../c10/util/TypeCast.h:135: fetch_and_cast: block: [0,0,0], thread: [4,0,0] Assertion `false` failed.
ERROR

======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_bfloat16! Caching allocator allocated memory was 512 and is now reported as 35328 on device 0. CUDA driver allocated memory was 1369440256 and is now 1371537408.

======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_bool! Caching allocator allocated memory was 35328 and is now reported as 70144 on device 0. CUDA driver allocated memory was 1371537408 and is now 1373634560.

======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_complex128! Caching allocator allocated memory was 70144 and is now reported as 104960 on device 0. CUDA driver allocated memory was 1373634560 and is now 1375731712.

======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_complex64! Caching allocator allocated memory was 104960 and is now reported as 139776 on device 0. CUDA driver allocated memory was 1375731712 and is now 1377828864.

======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_float16! Caching allocator allocated memory was 139776 and is now reported as 174592 on device 0. CUDA driver allocated memory was 1377828864 and is now 1379926016.

======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_float32! Caching allocator allocated memory was 174592 and is now reported as 209408 on device 0. CUDA driver allocated memory was 1379926016 and is now 1382023168.

======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_float64! Caching allocator allocated memory was 209408 and is now reported as 244224 on device 0. CUDA driver allocated memory was 1382023168 and is now 1384120320.

======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_int16! Caching allocator allocated memory was 244224 and is now reported as 279040 on device 0. CUDA driver allocated memory was 1384120320 and is now 1386217472.

======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_int32! Caching allocator allocated memory was 279040 and is now reported as 313856 on device 0. CUDA driver allocated memory was 1386217472 and is now 1388314624.

======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_int64! Caching allocator allocated memory was 313856 and is now reported as 348672 on device 0. CUDA driver allocated memory was 1388314624 and is now 1390411776.

======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_int8! Caching allocator allocated memory was 348672 and is now reported as 383488 on device 0. CUDA driver allocated memory was 1390411776 and is now 1392508928.

======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_uint8! Caching allocator allocated memory was 383488 and is now reported as 418304 on device 0. CUDA driver allocated memory was 1392508928 and is now 1394606080.

======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_bfloat16! Caching allocator allocated memory was 418304 and is now reported as 422400 on device 0. CUDA driver allocated memory was 1675624448 and is now 1677721600.

======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_bool! Caching allocator allocated memory was 422400 and is now reported as 426496 on device 0. CUDA driver allocated memory was 1677721600 and is now 1679818752.

======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_complex128! Caching allocator allocated memory was 426496 and is now reported as 430592 on device 0. CUDA driver allocated memory was 1679818752 and is now 1681915904.

======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_complex64! Caching allocator allocated memory was 430592 and is now reported as 434688 on device 0. CUDA driver allocated memory was 1681915904 and is now 1684013056.

======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_float16! Caching allocator allocated memory was 434688 and is now reported as 438784 on device 0. CUDA driver allocated memory was 1684013056 and is now 1686110208.

======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_float32! Caching allocator allocated memory was 438784 and is now reported as 442880 on device 0. CUDA driver allocated memory was 1686110208 and is now 1688207360.

======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_float64! Caching allocator allocated memory was 442880 and is now reported as 446976 on device 0. CUDA driver allocated memory was 1688207360 and is now 1690304512.

======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_int16! Caching allocator allocated memory was 446976 and is now reported as 451072 on device 0. CUDA driver allocated memory was 1690304512 and is now 1692401664.

======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_int32! Caching allocator allocated memory was 451072 and is now reported as 455168 on device 0. CUDA driver allocated memory was 1692401664 and is now 1694498816.

======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_int64! Caching allocator allocated memory was 455168 and is now reported as 459264 on device 0. CUDA driver allocated memory was 1694498816 and is now 1696595968.

======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_int8! Caching allocator allocated memory was 459264 and is now reported as 463360 on device 0. CUDA driver allocated memory was 1696595968 and is now 1698693120.

======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_uint8! Caching allocator allocated memory was 463360 and is now reported as 467456 on device 0. CUDA driver allocated memory was 1698693120 and is now 1700790272.

======================================================================
ERROR: test_nvfuser_correctness___rsub___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: type inference failed, unrecognized operation encountered:aten::rsub

======================================================================
ERROR: test_nvfuser_correctness___rsub___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: type inference failed, unrecognized operation encountered:aten::rsub

======================================================================
ERROR: test_nvfuser_correctness___rsub___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: type inference failed, unrecognized operation encountered:aten::rsub

======================================================================
ERROR: test_nvfuser_correctness___rsub___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: type inference failed, unrecognized operation encountered:aten::rsub

======================================================================
ERROR: test_nvfuser_correctness___rsub___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: type inference failed, unrecognized operation encountered:aten::rsub

======================================================================
ERROR: test_nvfuser_correctness___rsub___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: type inference failed, unrecognized operation encountered:aten::rsub

======================================================================
ERROR: test_nvfuser_correctness__masked_amax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_amax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_amax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_amax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_amax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Could not generate a max op for tensor with type: int

======================================================================
ERROR: test_nvfuser_correctness__masked_amax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_amin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: falseINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/executor_utils.cpp":972, please report a bug to PyTorch. namespace CudaCodeGen {

typedef unsigned char uint8_t;
typedef signed char int8_t;
typedef short int int16_t;
typedef int int32_t;
typedef unsigned int uint32_t;
typedef long long int int64_t;
typedef unsigned long long int uint64_t;
typedef int nvfuser_index_t;


#define __NVFUSER_HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_HALF_TO_CUS(var) \
  *(reinterpret_cast<const unsigned short*>(&(var)))

struct __half;
__device__ __half __float2half(const float);

struct __align__(2) __half {
  __half() = default;

  __device__ __half(const float f) {
    __x = __float2half(f).__x;
  }

 protected:
  unsigned short __x;
};

__device__ __half __float2half(const float f) {
  __half val;
  asm("{  cvt.rn.f16.f32 %0, %1;}\n"
      : "=h"(__NVFUSER_HALF_TO_US(val))
      : "f"(f));
  return val;
}

__device__ float __half2float(const __half h) {
  float val;
  asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__NVFUSER_HALF_TO_CUS(h)));
  return val;
}

// aligned vector generates vectorized load/store on CUDA
template <typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) Array {
  scalar_t val[vec_size];
  __device__ void set(scalar_t v) {
    for (int i = 0; i < vec_size; ++i) {
      val[i] = v;
    }
  }
};


#define __NVFUSER_BFLOAT_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_BFLOAT_TO_CUS(var) \
  *(reinterpret_cast<const unsigned short*>(&(var)))

struct __bfloat;
__device__ __bfloat __float2bfloat(const float);

struct __align__(2) __bfloat {
  __bfloat() = default;

  __device__ __bfloat(const float f) {
    __x = __float2bfloat(f).__x;
  }

 protected:
  unsigned short __x;
};

__device__ __bfloat __float2bfloat(const float f) {
  __bfloat val;
  asm("{  cvt.rn.bf16.f32 %0, %1;}\n"
      : "=h"(__NVFUSER_BFLOAT_TO_US(val))
      : "f"(f));
  return val;
}

__device__ float __bfloat2float(const __bfloat h) {
  float val;
  asm("{  mov.b32 %0, {0,%1};}\n"
      : "=f"(val)
      : "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
  return val;
}

template <typename T, int N>
struct Tensor {
  __device__ T& operator[](nvfuser_index_t ind) {
    return data[ind];
  };

  T* data;
  nvfuser_index_t size[N];
  nvfuser_index_t stride[N];
};

// Specialization for 0-dim case as it does not need size and stride arrays.
// They will be an error as well since zero-length arrays are not allowed.
template <typename T>
struct Tensor<T, 0> {
  __device__ T& operator[](nvfuser_index_t) {
    return *data;
  };

  T* data;
};

// Specialization for 0-dim case that's easy to pass in a CPU based tensor.
template <typename T>
struct CpuScalarTensor {
  __device__ T& operator[](int) {
    return data;
  };

  T data;
};

class Philox {
 public:
  __device__ Philox(
      unsigned long long seed,
      unsigned long long subsequence,
      unsigned long long offset) {
    key.x = (unsigned int)seed;
    key.y = (unsigned int)(seed >> 32);
    counter = make_uint4(0, 0, 0, 0);
    counter.z = (unsigned int)(subsequence);
    counter.w = (unsigned int)(subsequence >> 32);
    STATE = 0;
    incr_n(offset / 4);
  }

  __device__ unsigned long operator()() {
    if (STATE == 0) {
      uint4 counter_ = counter;
      uint2 key_ = key;
      for (int i = 0; i < 9; i++) {
        counter_ = single_round(counter_, key_);
        key_.x += (kPhilox10A);
        key_.y += (kPhilox10B);
      }
      output = single_round(counter_, key_);
      incr();
    }
    unsigned long ret = 0;
    switch (STATE) {
      case 0:
        ret = output.x;
        break;
      case 1:
        ret = output.y;
        break;
      case 2:
        ret = output.z;
        break;
      case 3:
        ret = output.w;
        break;
    }
    STATE = (STATE + 1) % 4;
    return ret;
  }

 private:
  __device__ void incr_n(unsigned long long n) {
    unsigned int nlo = (unsigned int)(n);
    unsigned int nhi = (unsigned int)(n >> 32);
    counter.x += nlo;
    if (counter.x < nlo)
      nhi++;
    counter.y += nhi;
    if (nhi <= counter.y)
      return;
    if (++counter.z)
      return;
    ++counter.w;
  }

  __device__ void incr() {
    if (++counter.x)
      return;
    if (++counter.y)
      return;
    if (++counter.z)
      return;
    ++counter.w;
  }

  __device__ unsigned int mulhilo32(
      unsigned int a,
      unsigned int b,
      unsigned int* result_high) {
    *result_high = __umulhi(a, b);
    return a * b;
  }

  __device__ uint4 single_round(uint4 ctr, uint2 key) {
    unsigned int hi0;
    unsigned int hi1;
    unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
    unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
    uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
    return ret;
  }

 private:
  static constexpr unsigned long kPhilox10A = 0x9E3779B9;
  static constexpr unsigned long kPhilox10B = 0xBB67AE85;
  static constexpr unsigned long kPhiloxSA = 0xD2511F53;
  static constexpr unsigned long kPhiloxSB = 0xCD9E8D57;

  uint4 counter = {};
  uint4 output = {};
  uint2 key = {};
  unsigned int STATE = 0;
};

__device__ float uniformf(unsigned int x) {
  constexpr float kRanInvM32 = 2.3283064e-10f; // Inverse of 2^32.
  return x * kRanInvM32;
}

__device__ double uniform(unsigned int x, unsigned int y) {
  constexpr double kRan2Pow53Inv = 1.1102230246251565e-16;
  const unsigned long long z =
      (unsigned long long)x ^ ((unsigned long long)y << (53 - 32));
  return z * kRan2Pow53Inv + (kRan2Pow53Inv / 2.0);
}

#define NVFUSER_DEFINE_MAGIC_ZERO          \
  __shared__ int nvfuser_zero_s;           \
  if (threadIdx.x == 0)                    \
    nvfuser_zero_s = 0;                    \
  __syncthreads();                         \
  atomicMin(&nvfuser_zero_s, threadIdx.x); \
  int nvfuser_zero = nvfuser_zero_s;

#define NVFUSER_UPDATE_MAGIC_ZERO \
  do {                            \
    nvfuser_zero <<= 1;           \
  } while (0);

__device__ constexpr int ceilDiv(int a, int b) {
  return (a + b - 1) / b;
}

__device__ constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}

__device__ constexpr int64_t ceilDiv(int64_t a, int b) {
  return ceilDiv(a, (int64_t)b);
}

__device__ constexpr int64_t ceilDiv(int a, int64_t b) {
  return ceilDiv((int64_t)a, b);
}

__device__ constexpr int max(int a, int b) {
  return ::max(a, b);
}

__device__ constexpr int64_t max(int64_t a, int b) {
  return ::max(a, (int64_t)b);
}

__device__ constexpr int64_t max(int a, int64_t b) {
  return ::max((int64_t)a, b);
}

__device__ constexpr int64_t max(int64_t a, int64_t b) {
  return ::max(a, b);
}

__device__ double fmax(double a, double b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmax(a, b);
  }
}

__device__ float fmax(float a, float b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmax(a, b);
  }
}

__device__ constexpr int min(int a, int b) {
  return ::min(a, b);
}

__device__ constexpr int64_t min(int64_t a, int b) {
  return ::min(a, (int64_t)b);
}

__device__ constexpr int64_t min(int a, int64_t b) {
  return ::min((int64_t)a, b);
}

__device__ constexpr int64_t min(int64_t a, int64_t b) {
  return ::min(a, b);
}

__device__ double fmin(double a, double b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmin(a, b);
  }
}

__device__ float fmin(float a, float b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmin(a, b);
  }
}

__device__ constexpr int alignBufferSize(int buffer, int size) {
  return (buffer + (size - 1)) & ~(size - 1);
}

__device__ double clamp(double x, double minv, double maxv) {
  return x < minv ? minv : (x > maxv ? maxv : x);
}

__device__ float clamp(float x, double minv, double maxv) {
  return x < minv ? minv : (x > maxv ? maxv : x);
}

__device__ double frac(double x) {
  return x - trunc(x);
}

__device__ float frac(float x) {
  return x - trunc(x);
}

__device__ double gelu(double x) {
  return x * normcdf(x);
}

__device__ float gelu(float x) {
  return x * normcdf(x);
}

__device__ double reciprocal(double x) {
  return 1 / x;
}

__device__ float reciprocal(float x) {
  return 1 / x;
}

__device__ double relu(double x) {
  return x <= 0 ? 0 : x;
}

__device__ float relu(float x) {
  return x <= 0 ? 0 : x;
}

__device__ float relu(int64_t x) {
  return x <= 0 ? 0 : x;
}

__device__ float relu(int x) {
  return x <= 0 ? 0 : x;
}

__device__ double remainder(double a, double b) {
  auto mod = ::fmod(a, b);
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ float remainder(float a, float b) {
  auto mod = ::fmod(a, b);
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ double sigmoid(double x) {
  return 1 / (1 + exp(-x));
}

__device__ float sigmoid(float x) {
  return 1 / (1 + exp(-x));
}

__device__ double silu(double x) {
  return x * sigmoid(x);
}

__device__ float silu(float x) {
  return x * sigmoid(x);
}

__device__ double threshold(double x, double t, double v) {
  return x <= t ? v : x;
}

__device__ float threshold(float x, double t, double v) {
  return x <= t ? v : x;
}

__device__ double where(bool c, double a, double b) {
  return c ? a : b;
}

__device__ float where(bool c, float a, float b) {
  return c ? a : b;
}

__device__ int64_t where(bool c, int64_t a, int64_t b) {
  return c ? a : b;
}

__device__ double randLike(Philox& rnd) {
  return uniform(rnd(), rnd());
}

__device__ float randLikef(Philox& rnd) {
  return uniformf(rnd());
}

__device__ constexpr int64_t remainder(int64_t a, int64_t b) {
  auto mod = a % b;
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ constexpr int remainder(int a, int b) {
  auto mod = a % b;
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ constexpr int64_t fmod(int64_t a, int64_t b) {
  return a % b;
}

__device__ constexpr int fmod(int a, int b) {
  return a % b;
}

__device__ constexpr double fmod(double a, double b) {
  return ::fmod(a, b);
}

__device__ constexpr float fmod(float a, float b) {
  return ::fmod(a, b);
}

template <typename T>
__device__ T pow(T a, T b) {
  if (b < 0) {
    if (a == 1) {
      return 1;
    } else if (a == -1) {
      auto negative = (-b) % static_cast<T>(2);
      return negative ? -1 : 1;
    } else {
      return 0;
    }
  } else {
    T result = 1;
    while (b) {
      if (b & 1) {
        result *= a;
      }
      b /= 2;
      a *= a;
    }
    return result;
  }
}

template int pow<int>(int a, int b);
template int64_t pow<int64_t>(int64_t a, int64_t b);

template <>
float pow<float>(float a, float b) {
  return ::pow(a, b);
}

template <>
double pow<double>(double a, double b) {
  return ::pow(a, b);
}

float pow(float a, int b) {
  return pow(a, (float)b);
}

double pow(double a, int b) {
  return pow(a, (double)b);
}

float pow(float a, int64_t b) {
  return pow(a, (float)b);
}

double pow(double a, int64_t b) {
  return pow(a, (double)b);
}

namespace index_utils {

// Utility functions

// Total size of provided dimension
template <typename _dim3>
__device__ __forceinline__ nvfuser_index_t size(const _dim3& d) {
  return (nvfuser_index_t)d.x * (nvfuser_index_t)d.y * (nvfuser_index_t)d.z;
}

// Linearized indexing of idx based on dim, if bool==false that dimension does
// not participate
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t maskedOffset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = 0;
  if (Z)
    offset += idx.z;
  if (Y)
    offset = offset * dim.y + idx.y;
  if (X)
    offset = offset * dim.x + idx.x;
  return offset;
}

// Linearized indexing of idx based on dim. All dimensions participate.
template <typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t offset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = idx.z;
  offset = offset * dim.y + idx.y;
  offset = offset * dim.x + idx.x;
  return offset;
}

// Masks the provided dim3, those == false get truncated to 1
template <bool X, bool Y, bool Z, typename _dim3>
__device__ dim3 maskedDims(const _dim3& dim) {
  return dim3{
      X ? (unsigned)dim.x : 1U,
      Y ? (unsigned)dim.y : 1U,
      Z ? (unsigned)dim.z : 1U};
}

// Provides total size of dim with masking, those dims == false do not
// participate in the size calculation
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
__device__ nvfuser_index_t maskedSize(const _dim3& dim) {
  return size(maskedDims<X_BLOCK, Y_BLOCK, Z_BLOCK>(dim));
}

// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3>
__device__ bool maskedIsZero(const _dim3& idx) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == 0;
  if (Y)
    isZero = isZero && idx.y == 0;
  if (Z)
    isZero = isZero && idx.z == 0;
  return isZero;
}

// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ bool maskedIsLast(const _dim3& idx, const _dim3_2& dim) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == dim.x - 1;
  if (Y)
    isZero = isZero && idx.y == dim.y - 1;
  if (Z)
    isZero = isZero && idx.z == dim.z - 1;
  return isZero;
}

} // namespace index_utils


// Default block synchronization. Just use __barrier_sync
namespace block_sync {

__forceinline__ __device__ void init() {}

// Thread-block synchronization
__forceinline__ __device__ void sync() {
  __barrier_sync(0);
}

} // namespace block_sync

namespace grid_sync {

// Get the first bit in a 64 bit integer
#define FIRST_UINT64_BIT ((uint64_t)1 << (sizeof(uint64_t) * 8 - 1))

template <typename T>
__device__ T globalAsVolatile(volatile T& global_val) {
  return global_val;
}

// A grid synchronization that can be called multiple times in a kernel assuming
// all the blocks fit on device at once. The semaphore is an integer semaphore
// assumed to be initialized to 0 before launching the kernel. The persistent
// option should be envoked if this sync will be called multiple times in one
// kernel (i.e. having a grid reduce within a loop). Having multiple grid syncs
// called once in the same kernel does not require persistent mode. Segment size
// is the number of blocks participating in the sync in the dimensions marked by
// [X,Y,Z]_BLOCK. The granularity of this sync are those dimensions. I.E.
// Marking X and Y but not Z means there should be Z semaphores of size X*Y.
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, bool PERSISTENT>
__device__ void sync(int64_t& semaphore, const uint64_t& segment_size) {
  // Finish all global memory transactions before synchronizing
  __threadfence();

  // Synchronize all threads in a block before synchronizing blocks
  block_sync::sync();

  // Only allow linear_tid == 0 to participate in the synchronization
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    // Get increment value, only want a single block to have the large
    // increment, doesn't really matter which one, the goal is to flip/flop the
    // first bit of a uint64_t value, since our semaphores are actualy int64_t
    // we will just reinterpret_cast it to act as a uint64_t
    uint64_t semaphore_increment = 1;

    // Makes the assumption that blocks are in increasing order, this is not
    // guaranteed by CUDA but this is the current behavior, and unlikely to
    // change.
    bool last_block =
        index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    if (last_block) {
      semaphore_increment = FIRST_UINT64_BIT - (segment_size - 1);
    }

    uint64_t oldArrive =
        atomicAdd(reinterpret_cast<uint64_t*>(&semaphore), semaphore_increment);

    // If for persistent kernels, lock all blocks until the semaphore has been
    // reached. Make sure we access semaphore as a volatile address so we get
    // the global memory updates.
    while ((PERSISTENT || last_block) &&
           ((oldArrive ^ globalAsVolatile(semaphore)) & FIRST_UINT64_BIT) ==
               0) {
      // Put a sleep here so we have some breaks in probing the global
      // semaphore, giving a better chance for other warps/blocks to catch up.
#if __CUDA_ARCH__ >= 700
      // __nanosleep only available on compute capability 7.0 or higher
      __nanosleep(200); // avoids busy waiting
#endif
    }
  }

  // Sync block to make sure all other threads are waiting on the sync
  block_sync::sync();
}
} // namespace grid_sync

// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block. If set to false the dimension doesn't
// participate in the reduction. We could start with warp reductions, then
// reduce the warps, this could save some shared memory, but could be slower in
// some instances.
//
//  EXAMPLE USAGE:
//  blockReduceSum<X_THREADS, Y_THREADS, Z_THREADS>
//    (output[output_index], inputs[input_index],
//      [] __device__ (T& a, const T b) { a += b; });
//
// Note: We agressively template functions taking dim3 in the functions below
//       because ROCM uses different types for the various dim3 and maps them
//       directly to intrinsics, but they're dim3 when used after modification.
//
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename Func,
    typename _dim3,
    typename _dim3_2>
__device__ void blockReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);

  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);

  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          thread_idx, block_dim);

  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          thread_idx, block_dim);

  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;

  // Initialize shared memory
  if (read_pred) {
    shared_mem[smem_offset] = inp_val;
  } else {
    shared_mem[smem_offset] = init_val;
  }

  block_sync::sync();
  // Reduce down to nearest power of 2 for the tree reduction:
  int np2 = 1 << (31 - __clz(reduction_size));

  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + np2]);
  }
  block_sync::sync();

  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + factor]);
    }
    block_sync::sync();
  }

  if (should_write && write_pred) {
    T result = out;
    reduction_op(result, shared_mem[smem_offset]);
    if (reduction_size > 1) {
      reduction_op(result, shared_mem[smem_offset + 1]);
    }
    out = result;
  }
  block_sync::sync();
}

// Use the same pred for both reads and writes
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename Func,
    typename _dim3,
    typename _dim3_2>
__device__ void blockReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem,
    bool read_write_pred,
    T init_val) {
  blockReduce<X_REDUCE, Y_REDUCE, Z_REDUCE, T, Func, _dim3, _dim3_2>(
      out,
      inp_val,
      reduction_op,
      thread_idx,
      block_dim,
      shared_mem,
      read_write_pred,
      read_write_pred,
      init_val);
}

// Inter-block reduction.
//
// The gridReduce function performs point-wise reductions of scalars across
// thread blocks. Thread blocks are disjointly partitioned into groups,
// "reduction segments", that are collectively defined by boolean template
// parameters, X_BLOCK, Y_BLOCK and Z_BLOCK. Each of X/Y/Z_BLOCK determines
// whether thread blocks along the dimension should be grouped into the same
// reduction segment. Cross-block reducitons are independently done within each
// segment and generates distinctive results per segment. For instance, if all
// of X/Y/Z_BLOCK are true, reductions will be done across all thread blocks
// since there will be just a single segment consisting of all thread blocks. If
// none of them are true, each thread block will become a segment by itself, so
// no reduction will be performed.
//
// The input scalars to reduce within each segment are a certain subset of
// thread-private scalars provided as part of the gridReduce function
// parameters. Boolean template parameters, X_THREAD, Y_THREAD and Z_THREAD,
// determine which subset of the scalars should be used for inter-block
// reductions. Specifically, all the input scalars of threads along each
// dimension will be used when X/Y/Z_THREAD are true. Otherwise, only the value
// held at offset 0 of each dimension will be used. Thus, for example, if all of
// X/Y/Z_THREAD are true, the scalars of all threads in each block will
// participate in inter-block reductions. If all of them are false, only one
// scalar of the thread at threadIdx.x == threadIdx.y == threadIdx.z == 0 will
// be used. In the code below, we call the subset of threads a "reduction
// block". "Participating" thread dimensions here are similar to the
// "non-participating" block dimensions. They come from a block dimension that
// has not been reduced before hitting this grid reduction.
//
// Inter-block reductions perform point-wise reductions of scalars of reduction
// blocks within each reduction segment. More specifically, let rb be a
// reduction block and rs be a reduction segment. Let IN(thread_idx, block_idx)
// denote the input scalar of thread at thread_idx and block_idx. The result of
// each reduction segment, OUT(thread_idx, block_idx_out), is defined only for
// each thread_idx in thread block block_idx_out in the segment as follows:
//
//   OUT(thread_idx, block_idx_out) =
//     Reduction of IN(thread_idx, block_idx) for
//       all block_idx in a reduction segment
//
// OUT is not given for all threads that are not in block_idx_out and the
// reduction block.
//
// See also the function comment of gridReduce.

namespace reduction {
// Reduces all the reduction blocks in each reduction segment. This is the
// "cleanup" stage of a grid reduction.
//
// This is only called by one thread block per reduction segment. The input
// reduction blocks of the segment are stored in an intermediate buffer pointed
// by parameter in. Template parameters X/Y/Z_THREAD denote how the reduction
// block is formed.
//
// The size of a reduction block is by definition smaller or equal to the size
// of a thread block. We use the remaining threads to parallelize reductions
// across reduction blocks. For example, when X/Y/Z_THREAD = {true, false,
// false}, we use blockDim.y*blockDim.z threads for each output value. This is
// done first by loading the input values in parallel and then by reducing
// across threads of dimensions whose XYZ_THREAD are false.
//
// Note that what is done here after the loading from global memory is similar
// to what the existing blockReduce function does.
template <
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    typename T,
    typename Func>
__device__ void gridReduceLastBlock(
    T& out,
    const volatile T* in,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    Func reduction_op,
    T* shared_buf,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.

  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, blockDim);

  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);

  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);

  T inp = init_val;

  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    reduction_op(inp, in[work_buf_offset]);
  }

  // Block reduce the per thread values into per "participating" thread values
  T inp_tmp = init_val;
  blockReduce<!X_THREAD, !Y_THREAD, !Z_THREAD>(
      inp_tmp,
      inp,
      reduction_op,
      threadIdx,
      blockDim,
      shared_buf,
      true,
      init_val);
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    reduction_op(out, inp_tmp);
  }
}

// Reduces per-thread values across thread blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - reduction_op: Scalar reduction function
// - work_buf: Temporary buffer for cross-block reductions
// - sync_flags: A vector of integers for synchronizations
// - shared_buf: Shared memory buffer for intra-block reduction
//
// Thread has valid results based on if it's the last block in the grid
// reduction dimension
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, reduces across thread blocks along the X/Y/Z
//   dimensions
// - X/Y/Z_THREAD: When true, all threads along the X/Y/Z dimensions participate
//   in the cross-block reduction. Otherwise, only threads at offset 0 do.
//   These are set to true if the dimension in the block has not been reduced
//   previously in producer tensors, and does not participate in the reduction
//   (right now they can't), so it's just a "pure" iteration domain as far as
//   the grid reduce is concerned.
// - PERSISTENT_REDUCTION: Indicates grid reduction will be called in a loop, or
//   the result of the grid reduction will be broadcasted and used across the
//   grid. These requires cross grid communication and the grid synchronizations
//   here to actually synchronize across the entire grid. When false the grid is
//   not synchronized, the last block just waits for everyone else to finish and
//   the other blocks can exit early.
// - T: Scalar data type of input/output data
// - Func: Type of scalara reduction function
//
// Template parameters X/Y/Z_BLOCK define a group of thread blocks that are
// reduced together. We call it a reduction segment. Some examples are:
//
// Case 1: X/Y/Z_BLOCK == true/true/true -> There is only one segment, which
// includes all thread blocks. It is effecively the same as the grid.
//
// Case 2: X/Y/Z_BLOCK == false/false/false -> Each thread block comprises an
// individual segment by itself.
//
// Case 3: X/Y/Z_BLOCK == true/false/false -> Each segment contains thread
// blocks that have the same blockDim.x. There will be blockDim.y*blockDim.z
// such segments.
//
// X/Y/Z_THREAD defines a sub region of a thread block that should be reduced
// with the sub regions of other thread blocks. We call it a reduction block.
// E.g.,
//
// Case 1: X/Y/Z_THREAD == false/false/false -> Only thread 0 participates in
// the cross-block reductions. The reduction block is 1x1x1 with thread 0.
//
// Case 2: X/Y/Z_THREAD == true/true/true-> All threads in a thread block
// participate in the cross-block reductions. The reduction block in this case
// is equivalent to the thread block.
//
// After the function completes, only one thread block per reduction segment
// gets valid reduction results. There is no guarantee which particular block
// gets the final results.
//
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    typename T,
    typename Func>
__device__ void gridReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    volatile T* work_buf,
    Tensor<int64_t, 1> sync_flags,
    T* shared_buf,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);

  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);

  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);

  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;

  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, blockDim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    if (read_pred) {
      work_buf[work_buf_offset] = inp_val;
    } else {
      work_buf[work_buf_offset] = init_val;
    }
  }

  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
      sync_flags[idx_in_grid_segment], grid_reduction_segment_size);

  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);

  if (last_block) {
    // Cleanup with block reduction
    gridReduceLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
        out,
        (T*)work_buf,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        reduction_op,
        shared_buf,
        write_pred,
        init_val);
  }

  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
        sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  }
}

} // namespace reduction

#undef isize
#undef ioffset

namespace grid_broadcast {

// Broadcasts per-thread values across threads and blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - work_buf: Temporary buffer for communication across threads/blocks
// - sync_flags: A vector of integers for synchronizations
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, broadcasts across thread blocks along the X/Y/Z
//   dimensions
// - X/Y/Z_THREAD: When true, broadcasts across threads along the X/Y/Z
//   dimensions
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    typename T>
__device__ void broadcast(
    T& out,
    const T& inp_val,
    volatile T* work_buf,
    Tensor<int64_t, 1> sync_flags,
    bool read_write_pred) {
  // Number of values broadcasted in the grid dimensions
  const auto grid_seg_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);

  // Index of the broadcast we're performing out of the grid_seg_size
  const auto grid_seg_idx =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);

  // Number of threads not participating in a broadcast dimension, this is the
  // number of thread entries to expect in the work buffer, therefore a striding
  const auto block_stride =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);

  // Which broadcast in the block this is to line up the entry with the work
  // buffer
  const auto thread_offset =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);

  const bool has_valid_data = (!X_BLOCK || blockIdx.x == gridDim.x - 1) &&
      (!Y_BLOCK || blockIdx.y == gridDim.y - 1) &&
      (!Z_BLOCK || blockIdx.z == gridDim.z - 1) &&
      (!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) &&
      (!Z_THREAD || threadIdx.z == 0);

  if (has_valid_data && read_write_pred) {
    work_buf[grid_seg_idx * block_stride + thread_offset] = inp_val;
    __threadfence();
  }

  bool null = false;
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
      sync_flags[grid_seg_idx], grid_seg_size);

  if (read_write_pred) {
    out = work_buf[grid_seg_idx * block_stride + thread_offset];
  }

  // Make sure everyone has read from the buffer before continuing the kernel
  // and potentially overwriting
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
      sync_flags[grid_seg_idx], grid_seg_size);
}
} // namespace grid_broadcast


namespace broadcast {
// Broadcasts within partitioned groups of threads.
//
// X_THREAD: Broadcast from threadIdx.x == 0 if true
// Y_THREAD: Broadcast from threadIdx.y == 0 if true
// Z_THREAD: Broadcast from threadIdx.z == 0 if true
// inp_val: Per-thread source value. Only valid when the thread is a source.
// out: Per-thread output location
//
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T>
__device__ void blockBroadcast(
    T& out,
    const T& inp_val,
    T* shared_mem,
    bool read_write_pred) {
  const bool has_valid_data = (!X_THREAD || threadIdx.x == 0) &&
      (!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0);

  const auto shared_offset =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);

  if (has_valid_data && read_write_pred) {
    shared_mem[shared_offset] = inp_val;
  }

  block_sync::sync();

  if (read_write_pred) {
    out = shared_mem[shared_offset];
  }

  block_sync::sync();
}

} // namespace broadcast

// -----------------------------------------------------------------------------------------------
//  Block Welford Primitives
// -----------------------------------------------------------------------------------------------
// Basic utility for welford update. Can be used to scan one value, or two merge
// two welford results
template <typename T, typename TN>
__inline__ __device__ void welfordCombine(
    T& a_avg,
    T& a_M2,
    TN& a_N,
    const T b_avg,
    const T b_M2,
    TN b_N) {
  if (b_N == 0) {
    return;
  }
  TN ab_N = a_N + b_N;
  T b_N_div_ab_N = ((T)(nvfuser_index_t)(b_N)) / ((T)(nvfuser_index_t)(ab_N));
  T delta = b_avg - a_avg;
  a_avg += delta * b_N_div_ab_N;
  a_M2 += b_M2 + delta * delta * ((T)(nvfuser_index_t)(a_N)) * b_N_div_ab_N;
  a_N = ab_N;
}

// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename TN,
    typename _dim3,
    typename _dim3_2>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);

  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);

  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          thread_idx, block_dim);

  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          thread_idx, block_dim);

  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;

  if (read_pred) {
    shared_mem_avg[smem_offset] = in_avg;
    shared_mem_M2[smem_offset] = in_M2;
    shared_mem_N[smem_offset] = in_N;
  } else {
    shared_mem_avg[smem_offset] = init_val;
    shared_mem_M2[smem_offset] = init_val;
    shared_mem_N[smem_offset] = 0;
  }

  block_sync::sync();
  // Reduce down to nearest power of 2:
  int np2 = 1 << (31 - __clz(reduction_size));

  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    welfordCombine(
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset],
        shared_mem_avg[smem_offset + np2],
        shared_mem_M2[smem_offset + np2],
        shared_mem_N[smem_offset + np2]);
  }
  block_sync::sync();

  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      welfordCombine(
          shared_mem_avg[smem_offset],
          shared_mem_M2[smem_offset],
          shared_mem_N[smem_offset],
          shared_mem_avg[smem_offset + factor],
          shared_mem_M2[smem_offset + factor],
          shared_mem_N[smem_offset + factor]);
    }
    block_sync::sync();
  }

  if (should_write && write_pred) {
    T res_avg = out_avg;
    T res_M2 = out_M2;
    TN res_N = out_N;
    welfordCombine(
        res_avg,
        res_M2,
        res_N,
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset]);
    if (reduction_size > 1) {
      welfordCombine(
          res_avg,
          res_M2,
          res_N,
          shared_mem_avg[smem_offset + 1],
          shared_mem_M2[smem_offset + 1],
          shared_mem_N[smem_offset + 1]);
    }
    out_avg = res_avg;
    out_M2 = res_M2;
    out_N = res_N;
  }
  block_sync::sync();
}

// Use the same pred for both reads and writes
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename TN,
    typename _dim3,
    typename _dim3_2>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_write_pred,
    T init_val) {
  blockWelford<X_REDUCE, Y_REDUCE, Z_REDUCE, T, TN, _dim3, _dim3_2>(
      out_avg,
      out_M2,
      out_N,
      in_avg,
      in_M2,
      in_N,
      thread_idx,
      block_dim,
      shared_mem_avg,
      shared_mem_M2,
      shared_mem_N,
      read_write_pred,
      read_write_pred,
      init_val);
}
// -----------------------------------------------------------------------------------------------
//  Grid Welford Prototype
// -----------------------------------------------------------------------------------------------
namespace welford {

template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T, typename TN>
__device__ void gridWelfordLastBlock(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const volatile T* in_avg,
    const volatile T* in_M2,
    const volatile TN* in_N,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.

  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, blockDim);

  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);

  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);

  T inp_avg = init_val;
  T inp_M2 = init_val;
  TN inp_N = 0;

  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    welfordCombine(
        inp_avg,
        inp_M2,
        inp_N,
        in_avg[work_buf_offset],
        in_M2[work_buf_offset],
        in_N[work_buf_offset]);
  }

  // Block reduce the per thread values into per "participating" thread values
  T inp_avg_tmp = init_val;
  T inp_M2_tmp = init_val;
  TN inp_N_tmp = 0;
  blockWelford<!X_THREAD, !Y_THREAD, !Z_THREAD>(
      inp_avg_tmp,
      inp_M2_tmp,
      inp_N_tmp,
      inp_avg,
      inp_M2,
      inp_N,
      threadIdx,
      blockDim,
      shared_buf_avg,
      shared_buf_M2,
      shared_buf_N,
      true,
      init_val);
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    welfordCombine(out_avg, out_M2, out_N, inp_avg_tmp, inp_M2_tmp, inp_N_tmp);
  }
}

//    Grid welford combine
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    typename T,
    typename TN>
__device__ void gridWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& inp_avg,
    const T& inp_M2,
    const TN& inp_N,
    volatile T* work_buf_avg,
    volatile T* work_buf_M2,
    volatile TN* work_buf_N,
    Tensor<int64_t, 1> sync_flags,
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);

  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);

  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);

  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf_avg += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  work_buf_M2 += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  work_buf_N += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;

  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, blockDim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    if (read_pred) {
      work_buf_avg[work_buf_offset] = inp_avg;
      work_buf_M2[work_buf_offset] = inp_M2;
      work_buf_N[work_buf_offset] = inp_N;
    } else {
      work_buf_avg[work_buf_offset] = init_val;
      work_buf_M2[work_buf_offset] = init_val;
      work_buf_N[work_buf_offset] = 0;
    }
  }

  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
      sync_flags[idx_in_grid_segment], grid_reduction_segment_size);

  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);

  if (last_block) {
    // final reduction
    gridWelfordLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
        out_avg,
        out_M2,
        out_N,
        work_buf_avg,
        work_buf_M2,
        work_buf_N,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        shared_buf_avg,
        shared_buf_M2,
        shared_buf_N,
        write_pred,
        init_val);
  }

  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
        sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  }
}

} // namespace welford

#undef isize
#undef ioffset

namespace warp {

template <
    bool SINGLE_WARP,
    typename T,
    typename Func,
    typename _dim3ti,
    typename _dim3bd>
__device__ void warpReduceTIDX(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3ti& thread_idx,
    const _dim3bd& block_dim,
    T* shared_mem,
    bool read_write_pred,
    T init_val) {
  constexpr int WARP_SIZE = 32;

  // Assume input padded to multiples of a warp
  T reduce_val = init_val;

  // Do warp reduction
  if (read_write_pred) {
    reduce_val = inp_val;
  }

  // Reduce within each warp
  for (int i = 16; i >= 1; i /= 2) {
    reduction_op(
        reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, WARP_SIZE));
  }

  // Reduce across warp if needed
  // Load value to shared mem
  if (!SINGLE_WARP) {
    unsigned int warp_idx = thread_idx.x / WARP_SIZE;
    unsigned int lane_idx = thread_idx.x % WARP_SIZE;
    unsigned int reduce_group_id = thread_idx.z * block_dim.y + thread_idx.y;
    bool is_warp_head = lane_idx == 0;
    unsigned int reduction_size = block_dim.x;
    unsigned int num_of_warps = reduction_size / WARP_SIZE;
    unsigned int smem_offset = reduce_group_id * num_of_warps;

    block_sync::sync();

    if (read_write_pred && is_warp_head) {
      shared_mem[smem_offset + warp_idx] = reduce_val;
    }

    block_sync::sync();

    if (warp_idx == 0) {
      // This assumes num_of_warps will be < 32, meaning < 1024 blocks.
      //  Should be true for long enough.
      assert(num_of_warps <= 32);

      reduce_val = lane_idx < num_of_warps ? shared_mem[smem_offset + lane_idx]
                                           : init_val;

      // Reduce within warp 0
      for (int i = 16; i >= 1; i /= 2) {
        reduction_op(
            reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, 32));
      }
    }

    if (is_warp_head) {
      reduction_op(out, reduce_val);
    }
  } else {
    reduction_op(out, reduce_val);
  }
}

} // namespace warp

// No "#pragma once" because this is a raw definition that can be copied by jit codegen.
// Eager mode clients should not include this file directly, instead,
// they should #include <ATen/cuda/CUDAGeneratorImpl.h>, which has a #pragma once.

// Stores RNG state values. Passed as a kernel argument.
// See Note [CUDA Graph-safe RNG states].
//
// The raw definition lives in its own file so jit codegen can easily copy it.
namespace at {

struct PhiloxCudaState {
  PhiloxCudaState() = default;
  // Called if graph capture is not underway
  PhiloxCudaState(uint64_t seed,
                  uint64_t offset) {
    seed_ = seed;
    offset_.val = offset;
  }
  // Called if graph capture is underway
  PhiloxCudaState(uint64_t seed,
                  int64_t* offset_extragraph,
                  uint32_t offset_intragraph) {
    seed_ = seed;
    offset_.ptr = offset_extragraph;
    offset_intragraph_ = offset_intragraph;
    captured_ = true;
  }

  // Public members, directly accessible by at::cuda::philox::unpack.
  // If we made them private with getters/setters, the getters/setters
  // would have to be __device__, and we can't declare __device__ in ATen.
  union Payload {
    uint64_t val;
    int64_t* ptr;
  };

  uint64_t seed_ = 0;
  Payload offset_;
  uint32_t offset_intragraph_ = 0;
  bool captured_ = false;
};

} // namespace at
__global__ void kernel115(Tensor<bool, 0> T0, Tensor<int, 0> T1, Tensor<int, 0> T2, Tensor<int, 0> T3) {
  T3[0]
     = where(T0[0], T1[0], T2[0]);
}
}

CUDA NVRTC compile error: default_program(1694): error: more than one instance of overloaded function "CudaCodeGen::where" matches the argument list:
            function "CudaCodeGen::where(__nv_bool, double, double)"
            function "CudaCodeGen::where(__nv_bool, float, float)"
            function "CudaCodeGen::where(__nv_bool, CudaCodeGen::int64_t, CudaCodeGen::int64_t)"
            argument types are: (__nv_bool, int, int)

1 error detected in the compilation of "default_program".


======================================================================
ERROR: test_nvfuser_correctness__masked_amin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amin_cuda_int64! Caching allocator allocated memory was 693248 and is now reported as 721408 on device 0. CUDA driver allocated memory was 1700790272 and is now 1702887424.

======================================================================
ERROR: test_nvfuser_correctness__masked_amin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amin_cuda_int8! Caching allocator allocated memory was 721408 and is now reported as 749568 on device 0. CUDA driver allocated memory was 1702887424 and is now 1704984576.

======================================================================
ERROR: test_nvfuser_correctness__masked_amin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amin_cuda_uint8! Caching allocator allocated memory was 749568 and is now reported as 777728 on device 0. CUDA driver allocated memory was 1704984576 and is now 1707081728.

======================================================================
ERROR: test_nvfuser_correctness__masked_log_softmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_log_softmax_cuda_bfloat16! Caching allocator allocated memory was 777728 and is now reported as 788480 on device 0. CUDA driver allocated memory was 1707081728 and is now 1709178880.

======================================================================
ERROR: test_nvfuser_correctness__masked_log_softmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_log_softmax_cuda_float16! Caching allocator allocated memory was 788480 and is now reported as 799232 on device 0. CUDA driver allocated memory was 1709178880 and is now 1711276032.

======================================================================
ERROR: test_nvfuser_correctness__masked_log_softmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_log_softmax_cuda_float32! Caching allocator allocated memory was 799232 and is now reported as 809984 on device 0. CUDA driver allocated memory was 1711276032 and is now 1713373184.

======================================================================
ERROR: test_nvfuser_correctness__masked_log_softmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_log_softmax_cuda_float64! Caching allocator allocated memory was 809984 and is now reported as 820736 on device 0. CUDA driver allocated memory was 1713373184 and is now 1715470336.

======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: falseINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/executor_utils.cpp":972, please report a bug to PyTorch. namespace CudaCodeGen {

typedef unsigned char uint8_t;
typedef signed char int8_t;
typedef short int int16_t;
typedef int int32_t;
typedef unsigned int uint32_t;
typedef long long int int64_t;
typedef unsigned long long int uint64_t;
typedef int nvfuser_index_t;


#define __NVFUSER_HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_HALF_TO_CUS(var) \
  *(reinterpret_cast<const unsigned short*>(&(var)))

struct __half;
__device__ __half __float2half(const float);

struct __align__(2) __half {
  __half() = default;

  __device__ __half(const float f) {
    __x = __float2half(f).__x;
  }

 protected:
  unsigned short __x;
};

__device__ __half __float2half(const float f) {
  __half val;
  asm("{  cvt.rn.f16.f32 %0, %1;}\n"
      : "=h"(__NVFUSER_HALF_TO_US(val))
      : "f"(f));
  return val;
}

__device__ float __half2float(const __half h) {
  float val;
  asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__NVFUSER_HALF_TO_CUS(h)));
  return val;
}

// aligned vector generates vectorized load/store on CUDA
template <typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) Array {
  scalar_t val[vec_size];
  __device__ void set(scalar_t v) {
    for (int i = 0; i < vec_size; ++i) {
      val[i] = v;
    }
  }
};


#define __NVFUSER_BFLOAT_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_BFLOAT_TO_CUS(var) \
  *(reinterpret_cast<const unsigned short*>(&(var)))

struct __bfloat;
__device__ __bfloat __float2bfloat(const float);

struct __align__(2) __bfloat {
  __bfloat() = default;

  __device__ __bfloat(const float f) {
    __x = __float2bfloat(f).__x;
  }

 protected:
  unsigned short __x;
};

__device__ __bfloat __float2bfloat(const float f) {
  __bfloat val;
  asm("{  cvt.rn.bf16.f32 %0, %1;}\n"
      : "=h"(__NVFUSER_BFLOAT_TO_US(val))
      : "f"(f));
  return val;
}

__device__ float __bfloat2float(const __bfloat h) {
  float val;
  asm("{  mov.b32 %0, {0,%1};}\n"
      : "=f"(val)
      : "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
  return val;
}

template <typename T, int N>
struct Tensor {
  __device__ T& operator[](nvfuser_index_t ind) {
    return data[ind];
  };

  T* data;
  nvfuser_index_t size[N];
  nvfuser_index_t stride[N];
};

// Specialization for 0-dim case as it does not need size and stride arrays.
// They will be an error as well since zero-length arrays are not allowed.
template <typename T>
struct Tensor<T, 0> {
  __device__ T& operator[](nvfuser_index_t) {
    return *data;
  };

  T* data;
};

// Specialization for 0-dim case that's easy to pass in a CPU based tensor.
template <typename T>
struct CpuScalarTensor {
  __device__ T& operator[](int) {
    return data;
  };

  T data;
};

class Philox {
 public:
  __device__ Philox(
      unsigned long long seed,
      unsigned long long subsequence,
      unsigned long long offset) {
    key.x = (unsigned int)seed;
    key.y = (unsigned int)(seed >> 32);
    counter = make_uint4(0, 0, 0, 0);
    counter.z = (unsigned int)(subsequence);
    counter.w = (unsigned int)(subsequence >> 32);
    STATE = 0;
    incr_n(offset / 4);
  }

  __device__ unsigned long operator()() {
    if (STATE == 0) {
      uint4 counter_ = counter;
      uint2 key_ = key;
      for (int i = 0; i < 9; i++) {
        counter_ = single_round(counter_, key_);
        key_.x += (kPhilox10A);
        key_.y += (kPhilox10B);
      }
      output = single_round(counter_, key_);
      incr();
    }
    unsigned long ret = 0;
    switch (STATE) {
      case 0:
        ret = output.x;
        break;
      case 1:
        ret = output.y;
        break;
      case 2:
        ret = output.z;
        break;
      case 3:
        ret = output.w;
        break;
    }
    STATE = (STATE + 1) % 4;
    return ret;
  }

 private:
  __device__ void incr_n(unsigned long long n) {
    unsigned int nlo = (unsigned int)(n);
    unsigned int nhi = (unsigned int)(n >> 32);
    counter.x += nlo;
    if (counter.x < nlo)
      nhi++;
    counter.y += nhi;
    if (nhi <= counter.y)
      return;
    if (++counter.z)
      return;
    ++counter.w;
  }

  __device__ void incr() {
    if (++counter.x)
      return;
    if (++counter.y)
      return;
    if (++counter.z)
      return;
    ++counter.w;
  }

  __device__ unsigned int mulhilo32(
      unsigned int a,
      unsigned int b,
      unsigned int* result_high) {
    *result_high = __umulhi(a, b);
    return a * b;
  }

  __device__ uint4 single_round(uint4 ctr, uint2 key) {
    unsigned int hi0;
    unsigned int hi1;
    unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
    unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
    uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
    return ret;
  }

 private:
  static constexpr unsigned long kPhilox10A = 0x9E3779B9;
  static constexpr unsigned long kPhilox10B = 0xBB67AE85;
  static constexpr unsigned long kPhiloxSA = 0xD2511F53;
  static constexpr unsigned long kPhiloxSB = 0xCD9E8D57;

  uint4 counter = {};
  uint4 output = {};
  uint2 key = {};
  unsigned int STATE = 0;
};

__device__ float uniformf(unsigned int x) {
  constexpr float kRanInvM32 = 2.3283064e-10f; // Inverse of 2^32.
  return x * kRanInvM32;
}

__device__ double uniform(unsigned int x, unsigned int y) {
  constexpr double kRan2Pow53Inv = 1.1102230246251565e-16;
  const unsigned long long z =
      (unsigned long long)x ^ ((unsigned long long)y << (53 - 32));
  return z * kRan2Pow53Inv + (kRan2Pow53Inv / 2.0);
}

#define NVFUSER_DEFINE_MAGIC_ZERO          \
  __shared__ int nvfuser_zero_s;           \
  if (threadIdx.x == 0)                    \
    nvfuser_zero_s = 0;                    \
  __syncthreads();                         \
  atomicMin(&nvfuser_zero_s, threadIdx.x); \
  int nvfuser_zero = nvfuser_zero_s;

#define NVFUSER_UPDATE_MAGIC_ZERO \
  do {                            \
    nvfuser_zero <<= 1;           \
  } while (0);

__device__ constexpr int ceilDiv(int a, int b) {
  return (a + b - 1) / b;
}

__device__ constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}

__device__ constexpr int64_t ceilDiv(int64_t a, int b) {
  return ceilDiv(a, (int64_t)b);
}

__device__ constexpr int64_t ceilDiv(int a, int64_t b) {
  return ceilDiv((int64_t)a, b);
}

__device__ constexpr int max(int a, int b) {
  return ::max(a, b);
}

__device__ constexpr int64_t max(int64_t a, int b) {
  return ::max(a, (int64_t)b);
}

__device__ constexpr int64_t max(int a, int64_t b) {
  return ::max((int64_t)a, b);
}

__device__ constexpr int64_t max(int64_t a, int64_t b) {
  return ::max(a, b);
}

__device__ double fmax(double a, double b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmax(a, b);
  }
}

__device__ float fmax(float a, float b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmax(a, b);
  }
}

__device__ constexpr int min(int a, int b) {
  return ::min(a, b);
}

__device__ constexpr int64_t min(int64_t a, int b) {
  return ::min(a, (int64_t)b);
}

__device__ constexpr int64_t min(int a, int64_t b) {
  return ::min((int64_t)a, b);
}

__device__ constexpr int64_t min(int64_t a, int64_t b) {
  return ::min(a, b);
}

__device__ double fmin(double a, double b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmin(a, b);
  }
}

__device__ float fmin(float a, float b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmin(a, b);
  }
}

__device__ constexpr int alignBufferSize(int buffer, int size) {
  return (buffer + (size - 1)) & ~(size - 1);
}

__device__ double clamp(double x, double minv, double maxv) {
  return x < minv ? minv : (x > maxv ? maxv : x);
}

__device__ float clamp(float x, double minv, double maxv) {
  return x < minv ? minv : (x > maxv ? maxv : x);
}

__device__ double frac(double x) {
  return x - trunc(x);
}

__device__ float frac(float x) {
  return x - trunc(x);
}

__device__ double gelu(double x) {
  return x * normcdf(x);
}

__device__ float gelu(float x) {
  return x * normcdf(x);
}

__device__ double reciprocal(double x) {
  return 1 / x;
}

__device__ float reciprocal(float x) {
  return 1 / x;
}

__device__ double relu(double x) {
  return x <= 0 ? 0 : x;
}

__device__ float relu(float x) {
  return x <= 0 ? 0 : x;
}

__device__ float relu(int64_t x) {
  return x <= 0 ? 0 : x;
}

__device__ float relu(int x) {
  return x <= 0 ? 0 : x;
}

__device__ double remainder(double a, double b) {
  auto mod = ::fmod(a, b);
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ float remainder(float a, float b) {
  auto mod = ::fmod(a, b);
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ double sigmoid(double x) {
  return 1 / (1 + exp(-x));
}

__device__ float sigmoid(float x) {
  return 1 / (1 + exp(-x));
}

__device__ double silu(double x) {
  return x * sigmoid(x);
}

__device__ float silu(float x) {
  return x * sigmoid(x);
}

__device__ double threshold(double x, double t, double v) {
  return x <= t ? v : x;
}

__device__ float threshold(float x, double t, double v) {
  return x <= t ? v : x;
}

__device__ double where(bool c, double a, double b) {
  return c ? a : b;
}

__device__ float where(bool c, float a, float b) {
  return c ? a : b;
}

__device__ int64_t where(bool c, int64_t a, int64_t b) {
  return c ? a : b;
}

__device__ double randLike(Philox& rnd) {
  return uniform(rnd(), rnd());
}

__device__ float randLikef(Philox& rnd) {
  return uniformf(rnd());
}

__device__ constexpr int64_t remainder(int64_t a, int64_t b) {
  auto mod = a % b;
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ constexpr int remainder(int a, int b) {
  auto mod = a % b;
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ constexpr int64_t fmod(int64_t a, int64_t b) {
  return a % b;
}

__device__ constexpr int fmod(int a, int b) {
  return a % b;
}

__device__ constexpr double fmod(double a, double b) {
  return ::fmod(a, b);
}

__device__ constexpr float fmod(float a, float b) {
  return ::fmod(a, b);
}

template <typename T>
__device__ T pow(T a, T b) {
  if (b < 0) {
    if (a == 1) {
      return 1;
    } else if (a == -1) {
      auto negative = (-b) % static_cast<T>(2);
      return negative ? -1 : 1;
    } else {
      return 0;
    }
  } else {
    T result = 1;
    while (b) {
      if (b & 1) {
        result *= a;
      }
      b /= 2;
      a *= a;
    }
    return result;
  }
}

template int pow<int>(int a, int b);
template int64_t pow<int64_t>(int64_t a, int64_t b);

template <>
float pow<float>(float a, float b) {
  return ::pow(a, b);
}

template <>
double pow<double>(double a, double b) {
  return ::pow(a, b);
}

float pow(float a, int b) {
  return pow(a, (float)b);
}

double pow(double a, int b) {
  return pow(a, (double)b);
}

float pow(float a, int64_t b) {
  return pow(a, (float)b);
}

double pow(double a, int64_t b) {
  return pow(a, (double)b);
}

namespace index_utils {

// Utility functions

// Total size of provided dimension
template <typename _dim3>
__device__ __forceinline__ nvfuser_index_t size(const _dim3& d) {
  return (nvfuser_index_t)d.x * (nvfuser_index_t)d.y * (nvfuser_index_t)d.z;
}

// Linearized indexing of idx based on dim, if bool==false that dimension does
// not participate
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t maskedOffset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = 0;
  if (Z)
    offset += idx.z;
  if (Y)
    offset = offset * dim.y + idx.y;
  if (X)
    offset = offset * dim.x + idx.x;
  return offset;
}

// Linearized indexing of idx based on dim. All dimensions participate.
template <typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t offset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = idx.z;
  offset = offset * dim.y + idx.y;
  offset = offset * dim.x + idx.x;
  return offset;
}

// Masks the provided dim3, those == false get truncated to 1
template <bool X, bool Y, bool Z, typename _dim3>
__device__ dim3 maskedDims(const _dim3& dim) {
  return dim3{
      X ? (unsigned)dim.x : 1U,
      Y ? (unsigned)dim.y : 1U,
      Z ? (unsigned)dim.z : 1U};
}

// Provides total size of dim with masking, those dims == false do not
// participate in the size calculation
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
__device__ nvfuser_index_t maskedSize(const _dim3& dim) {
  return size(maskedDims<X_BLOCK, Y_BLOCK, Z_BLOCK>(dim));
}

// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3>
__device__ bool maskedIsZero(const _dim3& idx) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == 0;
  if (Y)
    isZero = isZero && idx.y == 0;
  if (Z)
    isZero = isZero && idx.z == 0;
  return isZero;
}

// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ bool maskedIsLast(const _dim3& idx, const _dim3_2& dim) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == dim.x - 1;
  if (Y)
    isZero = isZero && idx.y == dim.y - 1;
  if (Z)
    isZero = isZero && idx.z == dim.z - 1;
  return isZero;
}

} // namespace index_utils


// Default block synchronization. Just use __barrier_sync
namespace block_sync {

__forceinline__ __device__ void init() {}

// Thread-block synchronization
__forceinline__ __device__ void sync() {
  __barrier_sync(0);
}

} // namespace block_sync

namespace grid_sync {

// Get the first bit in a 64 bit integer
#define FIRST_UINT64_BIT ((uint64_t)1 << (sizeof(uint64_t) * 8 - 1))

template <typename T>
__device__ T globalAsVolatile(volatile T& global_val) {
  return global_val;
}

// A grid synchronization that can be called multiple times in a kernel assuming
// all the blocks fit on device at once. The semaphore is an integer semaphore
// assumed to be initialized to 0 before launching the kernel. The persistent
// option should be envoked if this sync will be called multiple times in one
// kernel (i.e. having a grid reduce within a loop). Having multiple grid syncs
// called once in the same kernel does not require persistent mode. Segment size
// is the number of blocks participating in the sync in the dimensions marked by
// [X,Y,Z]_BLOCK. The granularity of this sync are those dimensions. I.E.
// Marking X and Y but not Z means there should be Z semaphores of size X*Y.
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, bool PERSISTENT>
__device__ void sync(int64_t& semaphore, const uint64_t& segment_size) {
  // Finish all global memory transactions before synchronizing
  __threadfence();

  // Synchronize all threads in a block before synchronizing blocks
  block_sync::sync();

  // Only allow linear_tid == 0 to participate in the synchronization
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    // Get increment value, only want a single block to have the large
    // increment, doesn't really matter which one, the goal is to flip/flop the
    // first bit of a uint64_t value, since our semaphores are actualy int64_t
    // we will just reinterpret_cast it to act as a uint64_t
    uint64_t semaphore_increment = 1;

    // Makes the assumption that blocks are in increasing order, this is not
    // guaranteed by CUDA but this is the current behavior, and unlikely to
    // change.
    bool last_block =
        index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    if (last_block) {
      semaphore_increment = FIRST_UINT64_BIT - (segment_size - 1);
    }

    uint64_t oldArrive =
        atomicAdd(reinterpret_cast<uint64_t*>(&semaphore), semaphore_increment);

    // If for persistent kernels, lock all blocks until the semaphore has been
    // reached. Make sure we access semaphore as a volatile address so we get
    // the global memory updates.
    while ((PERSISTENT || last_block) &&
           ((oldArrive ^ globalAsVolatile(semaphore)) & FIRST_UINT64_BIT) ==
               0) {
      // Put a sleep here so we have some breaks in probing the global
      // semaphore, giving a better chance for other warps/blocks to catch up.
#if __CUDA_ARCH__ >= 700
      // __nanosleep only available on compute capability 7.0 or higher
      __nanosleep(200); // avoids busy waiting
#endif
    }
  }

  // Sync block to make sure all other threads are waiting on the sync
  block_sync::sync();
}
} // namespace grid_sync

// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block. If set to false the dimension doesn't
// participate in the reduction. We could start with warp reductions, then
// reduce the warps, this could save some shared memory, but could be slower in
// some instances.
//
//  EXAMPLE USAGE:
//  blockReduceSum<X_THREADS, Y_THREADS, Z_THREADS>
//    (output[output_index], inputs[input_index],
//      [] __device__ (T& a, const T b) { a += b; });
//
// Note: We agressively template functions taking dim3 in the functions below
//       because ROCM uses different types for the various dim3 and maps them
//       directly to intrinsics, but they're dim3 when used after modification.
//
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename Func,
    typename _dim3,
    typename _dim3_2>
__device__ void blockReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);

  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);

  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          thread_idx, block_dim);

  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          thread_idx, block_dim);

  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;

  // Initialize shared memory
  if (read_pred) {
    shared_mem[smem_offset] = inp_val;
  } else {
    shared_mem[smem_offset] = init_val;
  }

  block_sync::sync();
  // Reduce down to nearest power of 2 for the tree reduction:
  int np2 = 1 << (31 - __clz(reduction_size));

  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + np2]);
  }
  block_sync::sync();

  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + factor]);
    }
    block_sync::sync();
  }

  if (should_write && write_pred) {
    T result = out;
    reduction_op(result, shared_mem[smem_offset]);
    if (reduction_size > 1) {
      reduction_op(result, shared_mem[smem_offset + 1]);
    }
    out = result;
  }
  block_sync::sync();
}

// Use the same pred for both reads and writes
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename Func,
    typename _dim3,
    typename _dim3_2>
__device__ void blockReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem,
    bool read_write_pred,
    T init_val) {
  blockReduce<X_REDUCE, Y_REDUCE, Z_REDUCE, T, Func, _dim3, _dim3_2>(
      out,
      inp_val,
      reduction_op,
      thread_idx,
      block_dim,
      shared_mem,
      read_write_pred,
      read_write_pred,
      init_val);
}

// Inter-block reduction.
//
// The gridReduce function performs point-wise reductions of scalars across
// thread blocks. Thread blocks are disjointly partitioned into groups,
// "reduction segments", that are collectively defined by boolean template
// parameters, X_BLOCK, Y_BLOCK and Z_BLOCK. Each of X/Y/Z_BLOCK determines
// whether thread blocks along the dimension should be grouped into the same
// reduction segment. Cross-block reducitons are independently done within each
// segment and generates distinctive results per segment. For instance, if all
// of X/Y/Z_BLOCK are true, reductions will be done across all thread blocks
// since there will be just a single segment consisting of all thread blocks. If
// none of them are true, each thread block will become a segment by itself, so
// no reduction will be performed.
//
// The input scalars to reduce within each segment are a certain subset of
// thread-private scalars provided as part of the gridReduce function
// parameters. Boolean template parameters, X_THREAD, Y_THREAD and Z_THREAD,
// determine which subset of the scalars should be used for inter-block
// reductions. Specifically, all the input scalars of threads along each
// dimension will be used when X/Y/Z_THREAD are true. Otherwise, only the value
// held at offset 0 of each dimension will be used. Thus, for example, if all of
// X/Y/Z_THREAD are true, the scalars of all threads in each block will
// participate in inter-block reductions. If all of them are false, only one
// scalar of the thread at threadIdx.x == threadIdx.y == threadIdx.z == 0 will
// be used. In the code below, we call the subset of threads a "reduction
// block". "Participating" thread dimensions here are similar to the
// "non-participating" block dimensions. They come from a block dimension that
// has not been reduced before hitting this grid reduction.
//
// Inter-block reductions perform point-wise reductions of scalars of reduction
// blocks within each reduction segment. More specifically, let rb be a
// reduction block and rs be a reduction segment. Let IN(thread_idx, block_idx)
// denote the input scalar of thread at thread_idx and block_idx. The result of
// each reduction segment, OUT(thread_idx, block_idx_out), is defined only for
// each thread_idx in thread block block_idx_out in the segment as follows:
//
//   OUT(thread_idx, block_idx_out) =
//     Reduction of IN(thread_idx, block_idx) for
//       all block_idx in a reduction segment
//
// OUT is not given for all threads that are not in block_idx_out and the
// reduction block.
//
// See also the function comment of gridReduce.

namespace reduction {
// Reduces all the reduction blocks in each reduction segment. This is the
// "cleanup" stage of a grid reduction.
//
// This is only called by one thread block per reduction segment. The input
// reduction blocks of the segment are stored in an intermediate buffer pointed
// by parameter in. Template parameters X/Y/Z_THREAD denote how the reduction
// block is formed.
//
// The size of a reduction block is by definition smaller or equal to the size
// of a thread block. We use the remaining threads to parallelize reductions
// across reduction blocks. For example, when X/Y/Z_THREAD = {true, false,
// false}, we use blockDim.y*blockDim.z threads for each output value. This is
// done first by loading the input values in parallel and then by reducing
// across threads of dimensions whose XYZ_THREAD are false.
//
// Note that what is done here after the loading from global memory is similar
// to what the existing blockReduce function does.
template <
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    typename T,
    typename Func>
__device__ void gridReduceLastBlock(
    T& out,
    const volatile T* in,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    Func reduction_op,
    T* shared_buf,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.

  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, blockDim);

  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);

  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);

  T inp = init_val;

  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    reduction_op(inp, in[work_buf_offset]);
  }

  // Block reduce the per thread values into per "participating" thread values
  T inp_tmp = init_val;
  blockReduce<!X_THREAD, !Y_THREAD, !Z_THREAD>(
      inp_tmp,
      inp,
      reduction_op,
      threadIdx,
      blockDim,
      shared_buf,
      true,
      init_val);
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    reduction_op(out, inp_tmp);
  }
}

// Reduces per-thread values across thread blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - reduction_op: Scalar reduction function
// - work_buf: Temporary buffer for cross-block reductions
// - sync_flags: A vector of integers for synchronizations
// - shared_buf: Shared memory buffer for intra-block reduction
//
// Thread has valid results based on if it's the last block in the grid
// reduction dimension
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, reduces across thread blocks along the X/Y/Z
//   dimensions
// - X/Y/Z_THREAD: When true, all threads along the X/Y/Z dimensions participate
//   in the cross-block reduction. Otherwise, only threads at offset 0 do.
//   These are set to true if the dimension in the block has not been reduced
//   previously in producer tensors, and does not participate in the reduction
//   (right now they can't), so it's just a "pure" iteration domain as far as
//   the grid reduce is concerned.
// - PERSISTENT_REDUCTION: Indicates grid reduction will be called in a loop, or
//   the result of the grid reduction will be broadcasted and used across the
//   grid. These requires cross grid communication and the grid synchronizations
//   here to actually synchronize across the entire grid. When false the grid is
//   not synchronized, the last block just waits for everyone else to finish and
//   the other blocks can exit early.
// - T: Scalar data type of input/output data
// - Func: Type of scalara reduction function
//
// Template parameters X/Y/Z_BLOCK define a group of thread blocks that are
// reduced together. We call it a reduction segment. Some examples are:
//
// Case 1: X/Y/Z_BLOCK == true/true/true -> There is only one segment, which
// includes all thread blocks. It is effecively the same as the grid.
//
// Case 2: X/Y/Z_BLOCK == false/false/false -> Each thread block comprises an
// individual segment by itself.
//
// Case 3: X/Y/Z_BLOCK == true/false/false -> Each segment contains thread
// blocks that have the same blockDim.x. There will be blockDim.y*blockDim.z
// such segments.
//
// X/Y/Z_THREAD defines a sub region of a thread block that should be reduced
// with the sub regions of other thread blocks. We call it a reduction block.
// E.g.,
//
// Case 1: X/Y/Z_THREAD == false/false/false -> Only thread 0 participates in
// the cross-block reductions. The reduction block is 1x1x1 with thread 0.
//
// Case 2: X/Y/Z_THREAD == true/true/true-> All threads in a thread block
// participate in the cross-block reductions. The reduction block in this case
// is equivalent to the thread block.
//
// After the function completes, only one thread block per reduction segment
// gets valid reduction results. There is no guarantee which particular block
// gets the final results.
//
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    typename T,
    typename Func>
__device__ void gridReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    volatile T* work_buf,
    Tensor<int64_t, 1> sync_flags,
    T* shared_buf,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);

  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);

  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);

  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;

  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, blockDim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    if (read_pred) {
      work_buf[work_buf_offset] = inp_val;
    } else {
      work_buf[work_buf_offset] = init_val;
    }
  }

  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
      sync_flags[idx_in_grid_segment], grid_reduction_segment_size);

  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);

  if (last_block) {
    // Cleanup with block reduction
    gridReduceLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
        out,
        (T*)work_buf,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        reduction_op,
        shared_buf,
        write_pred,
        init_val);
  }

  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
        sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  }
}

} // namespace reduction

#undef isize
#undef ioffset

namespace grid_broadcast {

// Broadcasts per-thread values across threads and blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - work_buf: Temporary buffer for communication across threads/blocks
// - sync_flags: A vector of integers for synchronizations
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, broadcasts across thread blocks along the X/Y/Z
//   dimensions
// - X/Y/Z_THREAD: When true, broadcasts across threads along the X/Y/Z
//   dimensions
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    typename T>
__device__ void broadcast(
    T& out,
    const T& inp_val,
    volatile T* work_buf,
    Tensor<int64_t, 1> sync_flags,
    bool read_write_pred) {
  // Number of values broadcasted in the grid dimensions
  const auto grid_seg_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);

  // Index of the broadcast we're performing out of the grid_seg_size
  const auto grid_seg_idx =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);

  // Number of threads not participating in a broadcast dimension, this is the
  // number of thread entries to expect in the work buffer, therefore a striding
  const auto block_stride =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);

  // Which broadcast in the block this is to line up the entry with the work
  // buffer
  const auto thread_offset =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);

  const bool has_valid_data = (!X_BLOCK || blockIdx.x == gridDim.x - 1) &&
      (!Y_BLOCK || blockIdx.y == gridDim.y - 1) &&
      (!Z_BLOCK || blockIdx.z == gridDim.z - 1) &&
      (!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) &&
      (!Z_THREAD || threadIdx.z == 0);

  if (has_valid_data && read_write_pred) {
    work_buf[grid_seg_idx * block_stride + thread_offset] = inp_val;
    __threadfence();
  }

  bool null = false;
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
      sync_flags[grid_seg_idx], grid_seg_size);

  if (read_write_pred) {
    out = work_buf[grid_seg_idx * block_stride + thread_offset];
  }

  // Make sure everyone has read from the buffer before continuing the kernel
  // and potentially overwriting
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
      sync_flags[grid_seg_idx], grid_seg_size);
}
} // namespace grid_broadcast


namespace broadcast {
// Broadcasts within partitioned groups of threads.
//
// X_THREAD: Broadcast from threadIdx.x == 0 if true
// Y_THREAD: Broadcast from threadIdx.y == 0 if true
// Z_THREAD: Broadcast from threadIdx.z == 0 if true
// inp_val: Per-thread source value. Only valid when the thread is a source.
// out: Per-thread output location
//
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T>
__device__ void blockBroadcast(
    T& out,
    const T& inp_val,
    T* shared_mem,
    bool read_write_pred) {
  const bool has_valid_data = (!X_THREAD || threadIdx.x == 0) &&
      (!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0);

  const auto shared_offset =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);

  if (has_valid_data && read_write_pred) {
    shared_mem[shared_offset] = inp_val;
  }

  block_sync::sync();

  if (read_write_pred) {
    out = shared_mem[shared_offset];
  }

  block_sync::sync();
}

} // namespace broadcast

// -----------------------------------------------------------------------------------------------
//  Block Welford Primitives
// -----------------------------------------------------------------------------------------------
// Basic utility for welford update. Can be used to scan one value, or two merge
// two welford results
template <typename T, typename TN>
__inline__ __device__ void welfordCombine(
    T& a_avg,
    T& a_M2,
    TN& a_N,
    const T b_avg,
    const T b_M2,
    TN b_N) {
  if (b_N == 0) {
    return;
  }
  TN ab_N = a_N + b_N;
  T b_N_div_ab_N = ((T)(nvfuser_index_t)(b_N)) / ((T)(nvfuser_index_t)(ab_N));
  T delta = b_avg - a_avg;
  a_avg += delta * b_N_div_ab_N;
  a_M2 += b_M2 + delta * delta * ((T)(nvfuser_index_t)(a_N)) * b_N_div_ab_N;
  a_N = ab_N;
}

// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename TN,
    typename _dim3,
    typename _dim3_2>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);

  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);

  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          thread_idx, block_dim);

  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          thread_idx, block_dim);

  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;

  if (read_pred) {
    shared_mem_avg[smem_offset] = in_avg;
    shared_mem_M2[smem_offset] = in_M2;
    shared_mem_N[smem_offset] = in_N;
  } else {
    shared_mem_avg[smem_offset] = init_val;
    shared_mem_M2[smem_offset] = init_val;
    shared_mem_N[smem_offset] = 0;
  }

  block_sync::sync();
  // Reduce down to nearest power of 2:
  int np2 = 1 << (31 - __clz(reduction_size));

  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    welfordCombine(
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset],
        shared_mem_avg[smem_offset + np2],
        shared_mem_M2[smem_offset + np2],
        shared_mem_N[smem_offset + np2]);
  }
  block_sync::sync();

  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      welfordCombine(
          shared_mem_avg[smem_offset],
          shared_mem_M2[smem_offset],
          shared_mem_N[smem_offset],
          shared_mem_avg[smem_offset + factor],
          shared_mem_M2[smem_offset + factor],
          shared_mem_N[smem_offset + factor]);
    }
    block_sync::sync();
  }

  if (should_write && write_pred) {
    T res_avg = out_avg;
    T res_M2 = out_M2;
    TN res_N = out_N;
    welfordCombine(
        res_avg,
        res_M2,
        res_N,
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset]);
    if (reduction_size > 1) {
      welfordCombine(
          res_avg,
          res_M2,
          res_N,
          shared_mem_avg[smem_offset + 1],
          shared_mem_M2[smem_offset + 1],
          shared_mem_N[smem_offset + 1]);
    }
    out_avg = res_avg;
    out_M2 = res_M2;
    out_N = res_N;
  }
  block_sync::sync();
}

// Use the same pred for both reads and writes
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename TN,
    typename _dim3,
    typename _dim3_2>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_write_pred,
    T init_val) {
  blockWelford<X_REDUCE, Y_REDUCE, Z_REDUCE, T, TN, _dim3, _dim3_2>(
      out_avg,
      out_M2,
      out_N,
      in_avg,
      in_M2,
      in_N,
      thread_idx,
      block_dim,
      shared_mem_avg,
      shared_mem_M2,
      shared_mem_N,
      read_write_pred,
      read_write_pred,
      init_val);
}
// -----------------------------------------------------------------------------------------------
//  Grid Welford Prototype
// -----------------------------------------------------------------------------------------------
namespace welford {

template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T, typename TN>
__device__ void gridWelfordLastBlock(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const volatile T* in_avg,
    const volatile T* in_M2,
    const volatile TN* in_N,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.

  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, blockDim);

  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);

  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);

  T inp_avg = init_val;
  T inp_M2 = init_val;
  TN inp_N = 0;

  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    welfordCombine(
        inp_avg,
        inp_M2,
        inp_N,
        in_avg[work_buf_offset],
        in_M2[work_buf_offset],
        in_N[work_buf_offset]);
  }

  // Block reduce the per thread values into per "participating" thread values
  T inp_avg_tmp = init_val;
  T inp_M2_tmp = init_val;
  TN inp_N_tmp = 0;
  blockWelford<!X_THREAD, !Y_THREAD, !Z_THREAD>(
      inp_avg_tmp,
      inp_M2_tmp,
      inp_N_tmp,
      inp_avg,
      inp_M2,
      inp_N,
      threadIdx,
      blockDim,
      shared_buf_avg,
      shared_buf_M2,
      shared_buf_N,
      true,
      init_val);
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    welfordCombine(out_avg, out_M2, out_N, inp_avg_tmp, inp_M2_tmp, inp_N_tmp);
  }
}

//    Grid welford combine
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    typename T,
    typename TN>
__device__ void gridWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& inp_avg,
    const T& inp_M2,
    const TN& inp_N,
    volatile T* work_buf_avg,
    volatile T* work_buf_M2,
    volatile TN* work_buf_N,
    Tensor<int64_t, 1> sync_flags,
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);

  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);

  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);

  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf_avg += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  work_buf_M2 += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  work_buf_N += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;

  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, blockDim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    if (read_pred) {
      work_buf_avg[work_buf_offset] = inp_avg;
      work_buf_M2[work_buf_offset] = inp_M2;
      work_buf_N[work_buf_offset] = inp_N;
    } else {
      work_buf_avg[work_buf_offset] = init_val;
      work_buf_M2[work_buf_offset] = init_val;
      work_buf_N[work_buf_offset] = 0;
    }
  }

  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
      sync_flags[idx_in_grid_segment], grid_reduction_segment_size);

  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);

  if (last_block) {
    // final reduction
    gridWelfordLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
        out_avg,
        out_M2,
        out_N,
        work_buf_avg,
        work_buf_M2,
        work_buf_N,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        shared_buf_avg,
        shared_buf_M2,
        shared_buf_N,
        write_pred,
        init_val);
  }

  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
        sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  }
}

} // namespace welford

#undef isize
#undef ioffset

namespace warp {

template <
    bool SINGLE_WARP,
    typename T,
    typename Func,
    typename _dim3ti,
    typename _dim3bd>
__device__ void warpReduceTIDX(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3ti& thread_idx,
    const _dim3bd& block_dim,
    T* shared_mem,
    bool read_write_pred,
    T init_val) {
  constexpr int WARP_SIZE = 32;

  // Assume input padded to multiples of a warp
  T reduce_val = init_val;

  // Do warp reduction
  if (read_write_pred) {
    reduce_val = inp_val;
  }

  // Reduce within each warp
  for (int i = 16; i >= 1; i /= 2) {
    reduction_op(
        reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, WARP_SIZE));
  }

  // Reduce across warp if needed
  // Load value to shared mem
  if (!SINGLE_WARP) {
    unsigned int warp_idx = thread_idx.x / WARP_SIZE;
    unsigned int lane_idx = thread_idx.x % WARP_SIZE;
    unsigned int reduce_group_id = thread_idx.z * block_dim.y + thread_idx.y;
    bool is_warp_head = lane_idx == 0;
    unsigned int reduction_size = block_dim.x;
    unsigned int num_of_warps = reduction_size / WARP_SIZE;
    unsigned int smem_offset = reduce_group_id * num_of_warps;

    block_sync::sync();

    if (read_write_pred && is_warp_head) {
      shared_mem[smem_offset + warp_idx] = reduce_val;
    }

    block_sync::sync();

    if (warp_idx == 0) {
      // This assumes num_of_warps will be < 32, meaning < 1024 blocks.
      //  Should be true for long enough.
      assert(num_of_warps <= 32);

      reduce_val = lane_idx < num_of_warps ? shared_mem[smem_offset + lane_idx]
                                           : init_val;

      // Reduce within warp 0
      for (int i = 16; i >= 1; i /= 2) {
        reduction_op(
            reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, 32));
      }
    }

    if (is_warp_head) {
      reduction_op(out, reduce_val);
    }
  } else {
    reduction_op(out, reduce_val);
  }
}

} // namespace warp

// No "#pragma once" because this is a raw definition that can be copied by jit codegen.
// Eager mode clients should not include this file directly, instead,
// they should #include <ATen/cuda/CUDAGeneratorImpl.h>, which has a #pragma once.

// Stores RNG state values. Passed as a kernel argument.
// See Note [CUDA Graph-safe RNG states].
//
// The raw definition lives in its own file so jit codegen can easily copy it.
namespace at {

struct PhiloxCudaState {
  PhiloxCudaState() = default;
  // Called if graph capture is not underway
  PhiloxCudaState(uint64_t seed,
                  uint64_t offset) {
    seed_ = seed;
    offset_.val = offset;
  }
  // Called if graph capture is underway
  PhiloxCudaState(uint64_t seed,
                  int64_t* offset_extragraph,
                  uint32_t offset_intragraph) {
    seed_ = seed;
    offset_.ptr = offset_extragraph;
    offset_intragraph_ = offset_intragraph;
    captured_ = true;
  }

  // Public members, directly accessible by at::cuda::philox::unpack.
  // If we made them private with getters/setters, the getters/setters
  // would have to be __device__, and we can't declare __device__ in ATen.
  union Payload {
    uint64_t val;
    int64_t* ptr;
  };

  uint64_t seed_ = 0;
  Payload offset_;
  uint32_t offset_intragraph_ = 0;
  bool captured_ = false;
};

} // namespace at
__global__ void kernel153(Tensor<bool, 0> T0, Tensor<bool, 0> T1, Tensor<bool, 0> T2, Tensor<int64_t, 0> T3, Tensor<int64_t, 0> T4, Tensor<bool, 0> T6, Tensor<int64_t, 0> T5) {
  T6[0]
     = where(T0[0], T1[0], T2[0]);
  T5[0]
     = where(T0[0], T3[0], T4[0]);
}
}

CUDA NVRTC compile error: default_program(1694): error: more than one instance of overloaded function "CudaCodeGen::where" matches the argument list:
            function "CudaCodeGen::where(__nv_bool, double, double)"
            function "CudaCodeGen::where(__nv_bool, float, float)"
            function "CudaCodeGen::where(__nv_bool, CudaCodeGen::int64_t, CudaCodeGen::int64_t)"
            argument types are: (__nv_bool, __nv_bool, __nv_bool)

1 error detected in the compilation of "default_program".


======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
    raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_mean_cuda_complex128! Caching allocator allocated memory was 820736 and is now reported as 888832 on device 0. CUDA driver allocated memory was 1715470336 and is now 1717567488.

======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: falseINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/executor_utils.cpp":972, please report a bug to PyTorch. namespace CudaCodeGen {

typedef unsigned char uint8_t;
typedef signed char int8_t;
typedef short int int16_t;
typedef int int32_t;
typedef unsigned int uint32_t;
typedef long long int int64_t;
typedef unsigned long long int uint64_t;
typedef int nvfuser_index_t;


#define __NVFUSER_HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_HALF_TO_CUS(var) \
  *(reinterpret_cast<const unsigned short*>(&(var)))

struct __half;
__device__ __half __float2half(const float);

struct __align__(2) __half {
  __half() = default;

  __device__ __half(const float f) {
    __x = __float2half(f).__x;
  }

 protected:
  unsigned short __x;
};

__device__ __half __float2half(const float f) {
  __half val;
  asm("{  cvt.rn.f16.f32 %0, %1;}\n"
      : "=h"(__NVFUSER_HALF_TO_US(val))
      : "f"(f));
  return val;
}

__device__ float __half2float(const __half h) {
  float val;
  asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__NVFUSER_HALF_TO_CUS(h)));
  return val;
}

// aligned vector generates vectorized load/store on CUDA
template <typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) Array {
  scalar_t val[vec_size];
  __device__ void set(scalar_t v) {
    for (int i = 0; i < vec_size; ++i) {
      val[i] = v;
    }
  }
};


#define __NVFUSER_BFLOAT_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_BFLOAT_TO_CUS(var) \
  *(reinterpret_cast<const unsigned short*>(&(var)))

struct __bfloat;
__device__ __bfloat __float2bfloat(const float);

struct __align__(2) __bfloat {
  __bfloat() = default;

  __device__ __bfloat(const float f) {
    __x = __float2bfloat(f).__x;
  }

 protected:
  unsigned short __x;
};

__device__ __bfloat __float2bfloat(const float f) {
  __bfloat val;
  asm("{  cvt.rn.bf16.f32 %0, %1;}\n"
      : "=h"(__NVFUSER_BFLOAT_TO_US(val))
      : "f"(f));
  return val;
}

__device__ float __bfloat2float(const __bfloat h) {
  float val;
  asm("{  mov.b32 %0, {0,%1};}\n"
      : "=f"(val)
      : "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
  return val;
}

template <typename T, int N>
struct Tensor {
  __device__ T& operator[](nvfuser_index_t ind) {
    return data[ind];
  };

  T* data;
  nvfuser_index_t size[N];
  nvfuser_index_t stride[N];
};

// Specialization for 0-dim case as it does not need size and stride arrays.
// They will be an error as well since zero-length arrays are not allowed.
template <typename T>
struct Tensor<T, 0> {
  __device__ T& operator[](nvfuser_index_t) {
    return *data;
  };

  T* data;
};

// Specialization for 0-dim case that's easy to pass in a CPU based tensor.
template <typename T>
struct CpuScalarTensor {
  __device__ T& operator[](int) {
    return data;
  };

  T data;
};

class Philox {
 public:
  __device__ Philox(
      unsigned long long seed,
      unsigned long long subsequence,
      unsigned long long offset) {
    key.x = (unsigned int)seed;
    key.y = (unsigned int)(seed >> 32);
    counter = make_uint4(0, 0, 0, 0);
    counter.z = (unsigned int)(subsequence);
    counter.w = (unsigned int)(subsequence >> 32);
    STATE = 0;
    incr_n(offset / 4);
  }

  __device__ unsigned long operator()() {
    if (STATE == 0) {
      uint4 counter_ = counter;
      uint2 key_ = key;
      for (int i = 0; i < 9; i++) {
        counter_ = single_round(counter_, key_);
        key_.x += (kPhilox10A);
        key_.y += (kPhilox10B);
      }
      output = single_round(counter_, key_);
      incr();
    }
    unsigned long ret = 0;
    switch (STATE) {
      case 0:
        ret = output.x;
        break;
      case 1:
        ret = output.y;
        break;
      case 2:
        ret = output.z;
        break;
      case 3:
        ret = output.w;
        break;
    }
    STATE = (STATE + 1) % 4;
    return ret;
  }

 private:
  __device__ void incr_n(unsigned long long n) {
    unsigned int nlo = (unsigned int)(n);
    unsigned int nhi = (unsigned int)(n >> 32);
    counter.x += nlo;
    if (counter.x < nlo)
      nhi++;
    counter.y += nhi;
    if (nhi <= counter.y)
      return;
    if (++counter.z)
      return;
    ++counter.w;
  }

  __device__ void incr() {
    if (++counter.x)
      return;
    if (++counter.y)
      return;
    if (++counter.z)
      return;
    ++counter.w;
  }

  __device__ unsigned int mulhilo32(
      unsigned int a,
      unsigned int b,
      unsigned int* result_high) {
    *result_high = __umulhi(a, b);
    return a * b;
  }

  __device__ uint4 single_round(uint4 ctr, uint2 key) {
    unsigned int hi0;
    unsigned int hi1;
    unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
    unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
    uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
    return ret;
  }

 private:
  static constexpr unsigned long kPhilox10A = 0x9E3779B9;
  static constexpr unsigned long kPhilox10B = 0xBB67AE85;
  static constexpr unsigned long kPhiloxSA = 0xD2511F53;
  static constexpr unsigned long kPhiloxSB = 0xCD9E8D57;

  uint4 counter = {};
  uint4 output = {};
  uint2 key = {};
  unsigned int STATE = 0;
};

__device__ float uniformf(unsigned int x) {
  constexpr float kRanInvM32 = 2.3283064e-10f; // Inverse of 2^32.
  return x * kRanInvM32;
}

__device__ double uniform(unsigned int x, unsigned int y) {
  constexpr double kRan2Pow53Inv = 1.1102230246251565e-16;
  const unsigned long long z =
      (unsigned long long)x ^ ((unsigned long long)y << (53 - 32));
  return z * kRan2Pow53Inv + (kRan2Pow53Inv / 2.0);
}

#define NVFUSER_DEFINE_MAGIC_ZERO          \
  __shared__ int nvfuser_zero_s;           \
  if (threadIdx.x == 0)                    \
    nvfuser_zero_s = 0;                    \
  __syncthreads();                         \
  atomicMin(&nvfuser_zero_s, threadIdx.x); \
  int nvfuser_zero = nvfuser_zero_s;

#define NVFUSER_UPDATE_MAGIC_ZERO \
  do {                            \
    nvfuser_zero <<= 1;           \
  } while (0);

__device__ constexpr int ceilDiv(int a, int b) {
  return (a + b - 1) / b;
}

__device__ constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}

__device__ constexpr int64_t ceilDiv(int64_t a, int b) {
  return ceilDiv(a, (int64_t)b);
}

__device__ constexpr int64_t ceilDiv(int a, int64_t b) {
  return ceilDiv((int64_t)a, b);
}

__device__ constexpr int max(int a, int b) {
  return ::max(a, b);
}

__device__ constexpr int64_t max(int64_t a, int b) {
  return ::max(a, (int64_t)b);
}

__device__ constexpr int64_t max(int a, int64_t b) {
  return ::max((int64_t)a, b);
}

__device__ constexpr int64_t max(int64_t a, int64_t b) {
  return ::max(a, b);
}

__device__ double fmax(double a, double b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmax(a, b);
  }
}

__device__ float fmax(float a, float b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmax(a, b);
  }
}

__device__ constexpr int min(int a, int b) {
  return ::min(a, b);
}

__device__ constexpr int64_t min(int64_t a, int b) {
  return ::min(a, (int64_t)b);
}

__device__ constexpr int64_t min(int a, int64_t b) {
  return ::min((int64_t)a, b);
}

__device__ constexpr int64_t min(int64_t a, int64_t b) {
  return ::min(a, b);
}

__device__ double fmin(double a, double b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmin(a, b);
  }
}

__device__ float fmin(float a, float b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmin(a, b);
  }
}

__device__ constexpr int alignBufferSize(int buffer, int size) {
  return (buffer + (size - 1)) & ~(size - 1);
}

__device__ double clamp(double x, double minv, double maxv) {
  return x < minv ? minv : (x > maxv ? maxv : x);
}

__device__ float clamp(float x, double minv, double maxv) {
  return x < minv ? minv : (x > maxv ? maxv : x);
}

__device__ double frac(double x) {
  return x - trunc(x);
}

__device__ float frac(float x) {
  return x - trunc(x);
}

__device__ double gelu(double x) {
  return x * normcdf(x);
}

__device__ float gelu(float x) {
  return x * normcdf(x);
}

__device__ double reciprocal(double x) {
  return 1 / x;
}

__device__ float reciprocal(float x) {
  return 1 / x;
}

__device__ double relu(double x) {
  return x <= 0 ? 0 : x;
}

__device__ float relu(float x) {
  return x <= 0 ? 0 : x;
}

__device__ float relu(int64_t x) {
  return x <= 0 ? 0 : x;
}

__device__ float relu(int x) {
  return x <= 0 ? 0 : x;
}

__device__ double remainder(double a, double b) {
  auto mod = ::fmod(a, b);
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ float remainder(float a, float b) {
  auto mod = ::fmod(a, b);
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ double sigmoid(double x) {
  return 1 / (1 + exp(-x));
}

__device__ float sigmoid(float x) {
  return 1 / (1 + exp(-x));
}

__device__ double silu(double x) {
  return x * sigmoid(x);
}

__device__ float silu(float x) {
  return x * sigmoid(x);
}

__device__ double threshold(double x, double t, double v) {
  return x <= t ? v : x;
}

__device__ float threshold(float x, double t, double v) {
  return x <= t ? v : x;
}

__device__ double where(bool c, double a, double b) {
  return c ? a : b;
}

__device__ float where(bool c, float a, float b) {
  return c ? a : b;
}

__device__ int64_t where(bool c, int64_t a, int64_t b) {
  return c ? a : b;
}

__device__ double randLike(Philox& rnd) {
  return uniform(rnd(), rnd());
}

__device__ float randLikef(Philox& rnd) {
  return uniformf(rnd());
}

__device__ constexpr int64_t remainder(int64_t a, int64_t b) {
  auto mod = a % b;
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ constexpr int remainder(int a, int b) {
  auto mod = a % b;
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ constexpr int64_t fmod(int64_t a, int64_t b) {
  return a % b;
}

__device__ constexpr int fmod(int a, int b) {
  return a % b;
}

__device__ constexpr double fmod(double a, double b) {
  return ::fmod(a, b);
}

__device__ constexpr float fmod(float a, float b) {
  return ::fmod(a, b);
}

template <typename T>
__device__ T pow(T a, T b) {
  if (b < 0) {
    if (a == 1) {
      return 1;
    } else if (a == -1) {
      auto negative = (-b) % static_cast<T>(2);
      return negative ? -1 : 1;
    } else {
      return 0;
    }
  } else {
    T result = 1;
    while (b) {
      if (b & 1) {
        result *= a;
      }
      b /= 2;
      a *= a;
    }
    return result;
  }
}

template int pow<int>(int a, int b);
template int64_t pow<int64_t>(int64_t a, int64_t b);

template <>
float pow<float>(float a, float b) {
  return ::pow(a, b);
}

template <>
double pow<double>(double a, double b) {
  return ::pow(a, b);
}

float pow(float a, int b) {
  return pow(a, (float)b);
}

double pow(double a, int b) {
  return pow(a, (double)b);
}

float pow(float a, int64_t b) {
  return pow(a, (float)b);
}

double pow(double a, int64_t b) {
  return pow(a, (double)b);
}

namespace index_utils {

// Utility functions

// Total size of provided dimension
template <typename _dim3>
__device__ __forceinline__ nvfuser_index_t size(const _dim3& d) {
  return (nvfuser_index_t)d.x * (nvfuser_index_t)d.y * (nvfuser_index_t)d.z;
}

// Linearized indexing of idx based on dim, if bool==false that dimension does
// not participate
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t maskedOffset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = 0;
  if (Z)
    offset += idx.z;
  if (Y)
    offset = offset * dim.y + idx.y;
  if (X)
    offset = offset * dim.x + idx.x;
  return offset;
}

// Linearized indexing of idx based on dim. All dimensions participate.
template <typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t offset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = idx.z;
  offset = offset * dim.y + idx.y;
  offset = offset * dim.x + idx.x;
  return offset;
}

// Masks the provided dim3, those == false get truncated to 1
template <bool X, bool Y, bool Z, typename _dim3>
__device__ dim3 maskedDims(const _dim3& dim) {
  return dim3{
      X ? (unsigned)dim.x : 1U,
      Y ? (unsigned)dim.y : 1U,
      Z ? (unsigned)dim.z : 1U};
}

// Provides total size of dim with masking, those dims == false do not
// participate in the size calculation
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
__device__ nvfuser_index_t maskedSize(const _dim3& dim) {
  return size(maskedDims<X_BLOCK, Y_BLOCK, Z_BLOCK>(dim));
}

// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3>
__device__ bool maskedIsZero(const _dim3& idx) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == 0;
  if (Y)
    isZero = isZero && idx.y == 0;
  if (Z)
    isZero = isZero && idx.z == 0;
  return isZero;
}

// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ bool maskedIsLast(const _dim3& idx, const _dim3_2& dim) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == dim.x - 1;
  if (Y)
    isZero = isZero && idx.y == dim.y - 1;
  if (Z)
    isZero = isZero && idx.z == dim.z - 1;
  return isZero;
}

} // namespace index_utils


// Default block synchronization. Just use __barrier_sync
namespace block_sync {

__forceinline__ __device__ void init() {}

// Thread-block synchronization
__forceinline__ __device__ void sync() {
  __barrier_sync(0);
}

} // namespace block_sync

namespace grid_sync {

// Get the first bit in a 64 bit integer
#define FIRST_UINT64_BIT ((uint64_t)1 << (sizeof(uint64_t) * 8 - 1))

template <typename T>
__device__ T globalAsVolatile(volatile T& global_val) {
  return global_val;
}

// A grid synchronization that can be called multiple times in a kernel assuming
// all the blocks fit on device at once. The semaphore is an integer semaphore
// assumed to be initialized to 0 before launching the kernel. The persistent
// option should be envoked if this sync will be called multiple times in one
// kernel (i.e. having a grid reduce within a loop). Having multiple grid syncs
// called once in the same kernel does not require persistent mode. Segment size
// is the number of blocks participating in the sync in the dimensions marked by
// [X,Y,Z]_BLOCK. The granularity of this sync are those dimensions. I.E.
// Marking X and Y but not Z means there should be Z semaphores of size X*Y.
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, bool PERSISTENT>
__device__ void sync(int64_t& semaphore, const uint64_t& segment_size) {
  // Finish all global memory transactions before synchronizing
  __threadfence();

  // Synchronize all threads in a block before synchronizing blocks
  block_sync::sync();

  // Only allow linear_tid == 0 to participate in the synchronization
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    // Get increment value, only want a single block to have the large
    // increment, doesn't really matter which one, the goal is to flip/flop the
    // first bit of a uint64_t value, since our semaphores are actualy int64_t
    // we will just reinterpret_cast it to act as a uint64_t
    uint64_t semaphore_increment = 1;

    // Makes the assumption that blocks are in increasing order, this is not
    // guaranteed by CUDA but this is the current behavior, and unlikely to
    // change.
    bool last_block =
        index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    if (last_block) {
      semaphore_increment = FIRST_UINT64_BIT - (segment_size - 1);
    }

    uint64_t oldArrive =
        atomicAdd(reinterpret_cast<uint64_t*>(&semaphore), semaphore_increment);

    // If for persistent kernels, lock all blocks until the semaphore has been
    // reached. Make sure we access semaphore as a volatile address so we get
    // the global memory updates.
    while ((PERSISTENT || last_block) &&
           ((oldArrive ^ globalAsVolatile(semaphore)) & FIRST_UINT64_BIT) ==
               0) {
      // Put a sleep here so we have some breaks in probing the global
      // semaphore, giving a better chance for other warps/blocks to catch up.
#if __CUDA_ARCH__ >= 700
      // __nanosleep only available on compute capability 7.0 or higher
      __nanosleep(200); // avoids busy waiting
#endif
    }
  }

  // Sync block to make sure all other threads are waiting on the sync
  block_sync::sync();
}
} // namespace grid_sync

// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block. If set to false the dimension doesn't
// participate in the reduction. We could start with warp reductions, then
// reduce the warps, this could save some shared memory, but could be slower in
// some instances.
//
//  EXAMPLE USAGE:
//  blockReduceSum<X_THREADS, Y_THREADS, Z_THREADS>
//    (output[output_index], inputs[input_index],
//      [] __device__ (T& a, const T b) { a += b; });
//
// Note: We agressively template functions taking dim3 in the functions below
//       because ROCM uses different types for the various dim3 and maps them
//       directly to intrinsics, but they're dim3 when used after modification.
//
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename Func,
    typename _dim3,
    typename _dim3_2>
__device__ void blockReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);

  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);

  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          thread_idx, block_dim);

  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          thread_idx, block_dim);

  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;

  // Initialize shared memory
  if (read_pred) {
    shared_mem[smem_offset] = inp_val;
  } else {
    shared_mem[smem_offset] = init_val;
  }

  block_sync::sync();
  // Reduce down to nearest power of 2 for the tree reduction:
  int np2 = 1 << (31 - __clz(reduction_size));

  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + np2]);
  }
  block_sync::sync();

  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + factor]);
    }
    block_sync::sync();
  }

  if (should_write && write_pred) {
    T result = out;
    reduction_op(result, shared_mem[smem_offset]);
    if (reduction_size > 1) {
      reduction_op(result, shared_mem[smem_offset + 1]);
    }
    out = result;
  }
  block_sync::sync();
}

// Use the same pred for both reads and writes
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename Func,
    typename _dim3,
    typename _dim3_2>
__device__ void blockReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem,
    bool read_write_pred,
    T init_val) {
  blockReduce<X_REDUCE, Y_REDUCE, Z_REDUCE, T, Func, _dim3, _dim3_2>(
      out,
      inp_val,
      reduction_op,
      thread_idx,
      block_dim,
      shared_mem,
      read_write_pred,
      read_write_pred,
      init_val);
}

// Inter-block reduction.
//
// The gridReduce function performs point-wise reductions of scalars across
// thread blocks. Thread blocks are disjointly partitioned into groups,
// "reduction segments", that are collectively defined by boolean template
// parameters, X_BLOCK, Y_BLOCK and Z_BLOCK. Each of X/Y/Z_BLOCK determines
// whether thread blocks along the dimension should be grouped into the same
// reduction segment. Cross-block reducitons are independently done within each
// segment and generates distinctive results per segment. For instance, if all
// of X/Y/Z_BLOCK are true, reductions will be done across all thread blocks
// since there will be just a single segment consisting of all thread blocks. If
// none of them are true, each thread block will become a segment by itself, so
// no reduction will be performed.
//
// The input scalars to reduce within each segment are a certain subset of
// thread-private scalars provided as part of the gridReduce function
// parameters. Boolean template parameters, X_THREAD, Y_THREAD and Z_THREAD,
// determine which subset of the scalars should be used for inter-block
// reductions. Specifically, all the input scalars of threads along each
// dimension will be used when X/Y/Z_THREAD are true. Otherwise, only the value
// held at offset 0 of each dimension will be used. Thus, for example, if all of
// X/Y/Z_THREAD are true, the scalars of all threads in each block will
// participate in inter-block reductions. If all of them are false, only one
// scalar of the thread at threadIdx.x == threadIdx.y == threadIdx.z == 0 will
// be used. In the code below, we call the subset of threads a "reduction
// block". "Participating" thread dimensions here are similar to the
// "non-participating" block dimensions. They come from a block dimension that
// has not been reduced before hitting this grid reduction.
//
// Inter-block reductions perform point-wise reductions of scalars of reduction
// blocks within each reduction segment. More specifically, let rb be a
// reduction block and rs be a reduction segment. Let IN(thread_idx, block_idx)
// denote the input scalar of thread at thread_idx and block_idx. The result of
// each reduction segment, OUT(thread_idx, block_idx_out), is defined only for
// each thread_idx in thread block block_idx_out in the segment as follows:
//
//   OUT(thread_idx, block_idx_out) =
//     Reduction of IN(thread_idx, block_idx) for
//       all block_idx in a reduction segment
//
// OUT is not given for all threads that are not in block_idx_out and the
// reduction block.
//
// See also the function comment of gridReduce.

namespace reduction {
// Reduces all the reduction blocks in each reduction segment. This is the
// "cleanup" stage of a grid reduction.
//
// This is only called by one thread block per reduction segment. The input
// reduction blocks of the segment are stored in an intermediate buffer pointed
// by parameter in. Template parameters X/Y/Z_THREAD denote how the reduction
// block is formed.
//
// The size of a reduction block is by definition smaller or equal to the size
// of a thread block. We use the remaining threads to parallelize reductions
// across reduction blocks. For example, when X/Y/Z_THREAD = {true, false,
// false}, we use blockDim.y*blockDim.z threads for each output value. This is
// done first by loading the input values in parallel and then by reducing
// across threads of dimensions whose XYZ_THREAD are false.
//
// Note that what is done here after the loading from global memory is similar
// to what the existing blockReduce function does.
template <
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    typename T,
    typename Func>
__device__ void gridReduceLastBlock(
    T& out,
    const volatile T* in,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    Func reduction_op,
    T* shared_buf,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.

  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, blockDim);

  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);

  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);

  T inp = init_val;

  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    reduction_op(inp, in[work_buf_offset]);
  }

  // Block reduce the per thread values into per "participating" thread values
  T inp_tmp = init_val;
  blockReduce<!X_THREAD, !Y_THREAD, !Z_THREAD>(
      inp_tmp,
      inp,
      reduction_op,
      threadIdx,
      blockDim,
      shared_buf,
      true,
      init_val);
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    reduction_op(out, inp_tmp);
  }
}

// Reduces per-thread values across thread blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - reduction_op: Scalar reduction function
// - work_buf: Temporary buffer for cross-block reductions
// - sync_flags: A vector of integers for synchronizations
// - shared_buf: Shared memory buffer for intra-block reduction
//
// Thread has valid results based on if it's the last block in the grid
// reduction dimension
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, reduces across thread blocks along the X/Y/Z
//   dimensions
// - X/Y/Z_THREAD: When true, all threads along the X/Y/Z dimensions participate
//   in the cross-block reduction. Otherwise, only threads at offset 0 do.
//   These are set to true if the dimension in the block has not been reduced
//   previously in producer tensors, and does not participate in the reduction
//   (right now they can't), so it's just a "pure" iteration domain as far as
//   the grid reduce is concerned.
// - PERSISTENT_REDUCTION: Indicates grid reduction will be called in a loop, or
//   the result of the grid reduction will be broadcasted and used across the
//   grid. These requires cross grid communication and the grid synchronizations
//   here to actually synchronize across the entire grid. When false the grid is
//   not synchronized, the last block just waits for everyone else to finish and
//   the other blocks can exit early.
// - T: Scalar data type of input/output data
// - Func: Type of scalara reduction function
//
// Template parameters X/Y/Z_BLOCK define a group of thread blocks that are
// reduced together. We call it a reduction segment. Some examples are:
//
// Case 1: X/Y/Z_BLOCK == true/true/true -> There is only one segment, which
// includes all thread blocks. It is effecively the same as the grid.
//
// Case 2: X/Y/Z_BLOCK == false/false/false -> Each thread block comprises an
// individual segment by itself.
//
// Case 3: X/Y/Z_BLOCK == true/false/false -> Each segment contains thread
// blocks that have the same blockDim.x. There will be blockDim.y*blockDim.z
// such segments.
//
// X/Y/Z_THREAD defines a sub region of a thread block that should be reduced
// with the sub regions of other thread blocks. We call it a reduction block.
// E.g.,
//
// Case 1: X/Y/Z_THREAD == false/false/false -> Only thread 0 participates in
// the cross-block reductions. The reduction block is 1x1x1 with thread 0.
//
// Case 2: X/Y/Z_THREAD == true/true/true-> All threads in a thread block
// participate in the cross-block reductions. The reduction block in this case
// is equivalent to the thread block.
//
// After the function completes, only one thread block per reduction segment
// gets valid reduction results. There is no guarantee which particular block
// gets the final results.
//
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    typename T,
    typename Func>
__device__ void gridReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    volatile T* work_buf,
    Tensor<int64_t, 1> sync_flags,
    T* shared_buf,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);

  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);

  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);

  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;

  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, blockDim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    if (read_pred) {
      work_buf[work_buf_offset] = inp_val;
    } else {
      work_buf[work_buf_offset] = init_val;
    }
  }

  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
      sync_flags[idx_in_grid_segment], grid_reduction_segment_size);

  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);

  if (last_block) {
    // Cleanup with block reduction
    gridReduceLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
        out,
        (T*)work_buf,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        reduction_op,
        shared_buf,
        write_pred,
        init_val);
  }

  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
        sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  }
}

} // namespace reduction

#undef isize
#undef ioffset

namespace grid_broadcast {

// Broadcasts per-thread values across threads and blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - work_buf: Temporary buffer for communication across threads/blocks
// - sync_flags: A vector of integers for synchronizations
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, broadcasts across thread blocks along the X/Y/Z
//   dimensions
// - X/Y/Z_THREAD: When true, broadcasts across threads along the X/Y/Z
//   dimensions
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    typename T>
__device__ void broadcast(
    T& out,
    const T& inp_val,
    volatile T* work_buf,
    Tensor<int64_t, 1> sync_flags,
    bool read_write_pred) {
  // Number of values broadcasted in the grid dimensions
  const auto grid_seg_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);

  // Index of the broadcast we're performing out of the grid_seg_size
  const auto grid_seg_idx =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);

  // Number of threads not participating in a broadcast dimension, this is the
  // number of thread entries to expect in the work buffer, therefore a striding
  const auto block_stride =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);

  // Which broadcast in the block this is to line up the entry with the work
  // buffer
  const auto thread_offset =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);

  const bool has_valid_data = (!X_BLOCK || blockIdx.x == gridDim.x - 1) &&
      (!Y_BLOCK || blockIdx.y == gridDim.y - 1) &&
      (!Z_BLOCK || blockIdx.z == gridDim.z - 1) &&
      (!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) &&
      (!Z_THREAD || threadIdx.z == 0);

  if (has_valid_data && read_write_pred) {
    work_buf[grid_seg_idx * block_stride + thread_offset] = inp_val;
    __threadfence();
  }

  bool null = false;
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
      sync_flags[grid_seg_idx], grid_seg_size);

  if (read_write_pred) {
    out = work_buf[grid_seg_idx * block_stride + thread_offset];
  }

  // Make sure everyone has read from the buffer before continuing the kernel
  // and potentially overwriting
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
      sync_flags[grid_seg_idx], grid_seg_size);
}
} // namespace grid_broadcast


namespace broadcast {
// Broadcasts within partitioned groups of threads.
//
// X_THREAD: Broadcast from threadIdx.x == 0 if true
// Y_THREAD: Broadcast from threadIdx.y == 0 if true
// Z_THREAD: Broadcast from threadIdx.z == 0 if true
// inp_val: Per-thread source value. Only valid when the thread is a source.
// out: Per-thread output location
//
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T>
__device__ void blockBroadcast(
    T& out,
    const T& inp_val,
    T* shared_mem,
    bool read_write_pred) {
  const bool has_valid_data = (!X_THREAD || threadIdx.x == 0) &&
      (!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0);

  const auto shared_offset =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);

  if (has_valid_data && read_write_pred) {
    shared_mem[shared_offset] = inp_val;
  }

  block_sync::sync();

  if (read_write_pred) {
    out = shared_mem[shared_offset];
  }

  block_sync::sync();
}

} // namespace broadcast

// -----------------------------------------------------------------------------------------------
//  Block Welford Primitives
// -----------------------------------------------------------------------------------------------
// Basic utility for welford update. Can be used to scan one value, or two merge
// two welford results
template <typename T, typename TN>
__inline__ __device__ void welfordCombine(
    T& a_avg,
    T& a_M2,
    TN& a_N,
    const T b_avg,
    const T b_M2,
    TN b_N) {
  if (b_N == 0) {
    return;
  }
  TN ab_N = a_N + b_N;
  T b_N_div_ab_N = ((T)(nvfuser_index_t)(b_N)) / ((T)(nvfuser_index_t)(ab_N));
  T delta = b_avg - a_avg;
  a_avg += delta * b_N_div_ab_N;
  a_M2 += b_M2 + delta * delta * ((T)(nvfuser_index_t)(a_N)) * b_N_div_ab_N;
  a_N = ab_N;
}

// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename TN,
    typename _dim3,
    typename _dim3_2>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);

  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);

  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          thread_idx, block_dim);

  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          thread_idx, block_dim);

  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;

  if (read_pred) {
    shared_mem_avg[smem_offset] = in_avg;
    shared_mem_M2[smem_offset] = in_M2;
    shared_mem_N[smem_offset] = in_N;
  } else {
    shared_mem_avg[smem_offset] = init_val;
    shared_mem_M2[smem_offset] = init_val;
    shared_mem_N[smem_offset] = 0;
  }

  block_sync::sync();
  // Reduce down to nearest power of 2:
  int np2 = 1 << (31 - __clz(reduction_size));

  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    welfordCombine(
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset],
        shared_mem_avg[smem_offset + np2],
        shared_mem_M2[smem_offset + np2],
        shared_mem_N[smem_offset + np2]);
  }
  block_sync::sync();

  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      welfordCombine(
          shared_mem_avg[smem_offset],
          shared_mem_M2[smem_offset],
          shared_mem_N[smem_offset],
          shared_mem_avg[smem_offset + factor],
          shared_mem_M2[smem_offset + factor],
          shared_mem_N[smem_offset + factor]);
    }
    block_sync::sync();
  }

  if (should_write && write_pred) {
    T res_avg = out_avg;
    T res_M2 = out_M2;
    TN res_N = out_N;
    welfordCombine(
        res_avg,
        res_M2,
        res_N,
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset]);
    if (reduction_size > 1) {
      welfordCombine(
          res_avg,
          res_M2,
          res_N,
          shared_mem_avg[smem_offset + 1],
          shared_mem_M2[smem_offset + 1],
          shared_mem_N[smem_offset + 1]);
    }
    out_avg = res_avg;
    out_M2 = res_M2;
    out_N = res_N;
  }
  block_sync::sync();
}

// Use the same pred for both reads and writes
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename TN,
    typename _dim3,
    typename _dim3_2>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_write_pred,
    T init_val) {
  blockWelford<X_REDUCE, Y_REDUCE, Z_REDUCE, T, TN, _dim3, _dim3_2>(
      out_avg,
      out_M2,
      out_N,
      in_avg,
      in_M2,
      in_N,
      thread_idx,
      block_dim,
      shared_mem_avg,
      shared_mem_M2,
      shared_mem_N,
      read_write_pred,
      read_write_pred,
      init_val);
}
// -----------------------------------------------------------------------------------------------
//  Grid Welford Prototype
// -----------------------------------------------------------------------------------------------
namespace welford {

template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T, typename TN>
__device__ void gridWelfordLastBlock(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const volatile T* in_avg,
    const volatile T* in_M2,
    const volatile TN* in_N,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.

  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, blockDim);

  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);

  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);

  T inp_avg = init_val;
  T inp_M2 = init_val;
  TN inp_N = 0;

  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    welfordCombine(
        inp_avg,
        inp_M2,
        inp_N,
        in_avg[work_buf_offset],
        in_M2[work_buf_offset],
        in_N[work_buf_offset]);
  }

  // Block reduce the per thread values into per "participating" thread values
  T inp_avg_tmp = init_val;
  T inp_M2_tmp = init_val;
  TN inp_N_tmp = 0;
  blockWelford<!X_THREAD, !Y_THREAD, !Z_THREAD>(
      inp_avg_tmp,
      inp_M2_tmp,
      inp_N_tmp,
      inp_avg,
      inp_M2,
      inp_N,
      threadIdx,
      blockDim,
      shared_buf_avg,
      shared_buf_M2,
      shared_buf_N,
      true,
      init_val);
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    welfordCombine(out_avg, out_M2, out_N, inp_avg_tmp, inp_M2_tmp, inp_N_tmp);
  }
}

//    Grid welford combine
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    typename T,
    typename TN>
__device__ void gridWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& inp_avg,
    const T& inp_M2,
    const TN& inp_N,
    volatile T* work_buf_avg,
    volatile T* work_buf_M2,
    volatile TN* work_buf_N,
    Tensor<int64_t, 1> sync_flags,
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);

  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);

  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);

  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf_avg += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  work_buf_M2 += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  work_buf_N += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;

  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, blockDim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    if (read_pred) {
      work_buf_avg[work_buf_offset] = inp_avg;
      work_buf_M2[work_buf_offset] = inp_M2;
      work_buf_N[work_buf_offset] = inp_N;
    } else {
      work_buf_avg[work_buf_offset] = init_val;
      work_buf_M2[work_buf_offset] = init_val;
      work_buf_N[work_buf_offset] = 0;
    }
  }

  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
      sync_flags[idx_in_grid_segment], grid_reduction_segment_size);

  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);

  if (last_block) {
    // final reduction
    gridWelfordLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
        out_avg,
        out_M2,
        out_N,
        work_buf_avg,
        work_buf_M2,
        work_buf_N,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        shared_buf_avg,
        shared_buf_M2,
        shared_buf_N,
        write_pred,
        init_val);
  }

  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
        sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  }
}

} // namespace welford

#undef isize
#undef ioffset

namespace warp {

template <
    bool SINGLE_WARP,
    typename T,
    typename Func,
    typename _dim3ti,
    typename _dim3bd>
__device__ void warpReduceTIDX(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3ti& thread_idx,
    const _dim3bd& block_dim,
    T* shared_mem,
    bool read_write_pred,
    T init_val) {
  constexpr int WARP_SIZE = 32;

  // Assume input padded to multiples of a warp
  T reduce_val = init_val;

  // Do warp reduction
  if (read_write_pred) {
    reduce_val = inp_val;
  }

  // Reduce within each warp
  for (int i = 16; i >= 1; i /= 2) {
    reduction_op(
        reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, WARP_SIZE));
  }

  // Reduce across warp if needed
  // Load value to shared mem
  if (!SINGLE_WARP) {
    unsigned int warp_idx = thread_idx.x / WARP_SIZE;
    unsigned int lane_idx = thread_idx.x % WARP_SIZE;
    unsigned int reduce_group_id = thread_idx.z * block_dim.y + thread_idx.y;
    bool is_warp_head = lane_idx == 0;
    unsigned int reduction_size = block_dim.x;
    unsigned int num_of_warps = reduction_size / WARP_SIZE;
    unsigned int smem_offset = reduce_group_id * num_of_warps;

    block_sync::sync();

    if (read_write_pred && is_warp_head) {
      shared_mem[smem_offset + warp_idx] = reduce_val;
    }

    block_sync::sync();

    if (warp_idx == 0) {
      // This assumes num_of_warps will be < 32, meaning < 1024 blocks.
      //  Should be true for long enough.
      assert(num_of_warps <= 32);

      reduce_val = lane_idx < num_of_warps ? shared_mem[smem_offset + lane_idx]
                                           : init_val;

      // Reduce within warp 0
      for (int i = 16; i >= 1; i /= 2) {
        reduction_op(
            reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, 32));
      }
    }

    if (is_warp_head) {
      reduction_op(out, reduce_val);
    }
  } else {
    reduction_op(out, reduce_val);
  }
}

} // namespace warp

// No "#pragma once" because this is a raw definition that can be copied by jit codegen.
// Eager mode clients should not include this file directly, instead,
// they should #include <ATen/cuda/CUDAGeneratorImpl.h>, which has a #pragma once.

// Stores RNG state values. Passed as a kernel argument.
// See Note [CUDA Graph-safe RNG states].
//
// The raw definition lives in its own file so jit codegen can easily copy it.
namespace at {

struct PhiloxCudaState {
  PhiloxCudaState() = default;
  // Called if graph capture is not underway
  PhiloxCudaState(uint64_t seed,
                  uint64_t offset) {
    seed_ = seed;
    offset_.val = offset;
  }
  // Called if graph capture is underway
  PhiloxCudaState(uint64_t seed,
                  int64_t* offset_extragraph,
                  uint32_t offset_intragraph) {
    seed_ = seed;
    offset_.ptr = offset_extragraph;
    offset_intragraph_ = offset_intragraph;
    captured_ = true;
  }

  // Public members, directly accessible by at::cuda::philox::unpack.
  // If we made them private with getters/setters, the getters/setters
  // would have to be __device__, and we can't declare __device__ in ATen.
  union Payload {
    uint64_t val;
    int64_t* ptr;
  };

  uint64_t seed_ = 0;
  Payload offset_;
  uint32_t offset_intragraph_ = 0;
  bool captured_ = false;
};

} // namespace at
__global__ void kernel158(Tensor<bool, 0> T0, Tensor<int, 0> T1, Tensor<int, 0> T2, Tensor<int64_t, 0> T3, Tensor<int64_t, 0> T4, Tensor<int, 0> T6, Tensor<int64_t, 0> T5) {
  T6[0]
     = where(T0[0], T1[0], T2[0]);
  T5[0]
     = where(T0[0], T3[0], T4[0]);
}
}

CUDA NVRTC compile error: default_program(1694): error: more than one instance of overloaded function "CudaCodeGen::where" matches the argument list:
            function "CudaCodeGen::where(__nv_bool, double, double)"
            function "CudaCodeGen::where(__nv_bool, float, float)"
            function "CudaCodeGen::where(__nv_bool, CudaCodeGen::int64_t, CudaCodeGen::int64_t)"
            argument types are: (__nv_bool, int, int)

1 error detected in the compilation of "default_program".


======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Error adding cache_after T2_g[ iS4{i5}, iS5{i6}, sbS6{1}, iS7{i7} ] we restrict using cache_after on an output.


======================================================================
ERROR: test_nvfuser_correctness__masked_prod_cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: falseINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/executor_utils.cpp":972, please report a bug to PyTorch. namespace CudaCodeGen {

typedef unsigned char uint8_t;
typedef signed char int8_t;
typedef short int int16_t;
typedef int int32_t;
typedef unsigned int uint32_t;
typedef long long int int64_t;
typedef unsigned long long int uint64_t;
typedef int nvfuser_index_t;


#define __NVFUSER_HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_HALF_TO_CUS(var) \
  *(reinterpret_cast<const unsigned short*>(&(var)))

struct __half;
__device__ __half __float2half(const float);

struct __align__(2) __half {
  __half() = default;

  __device__ __half(const float f) {
    __x = __float2half(f).__x;
  }

 protected:
  unsigned short __x;
};

__device__ __half __float2half(const float f) {
  __half val;
  asm("{  cvt.rn.f16.f32 %0, %1;}\n"
      : "=h"(__NVFUSER_HALF_TO_US(val))
      : "f"(f));
  return val;
}

__device__ float __half2float(const __half h) {
  float val;
  asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__NVFUSER_HALF_TO_CUS(h)));
  return val;
}

// aligned vector generates vectorized load/store on CUDA
template <typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) Array {
  scalar_t val[vec_size];
  __device__ void set(scalar_t v) {
    for (int i = 0; i < vec_size; ++i) {
      val[i] = v;
    }
  }
};


#define __NVFUSER_BFLOAT_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_BFLOAT_TO_CUS(var) \
  *(reinterpret_cast<const unsigned short*>(&(var)))

struct __bfloat;
__device__ __bfloat __float2bfloat(const float);

struct __align__(2) __bfloat {
  __bfloat() = default;

  __device__ __bfloat(const float f) {
    __x = __float2bfloat(f).__x;
  }

 protected:
  unsigned short __x;
};

__device__ __bfloat __float2bfloat(const float f) {
  __bfloat val;
  asm("{  cvt.rn.bf16.f32 %0, %1;}\n"
      : "=h"(__NVFUSER_BFLOAT_TO_US(val))
      : "f"(f));
  return val;
}

__device__ float __bfloat2float(const __bfloat h) {
  float val;
  asm("{  mov.b32 %0, {0,%1};}\n"
      : "=f"(val)
      : "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
  return val;
}

template <typename T, int N>
struct Tensor {
  __device__ T& operator[](nvfuser_index_t ind) {
    return data[ind];
  };

  T* data;
  nvfuser_index_t size[N];
  nvfuser_index_t stride[N];
};

// Specialization for 0-dim case as it does not need size and stride arrays.
// They will be an error as well since zero-length arrays are not allowed.
template <typename T>
struct Tensor<T, 0> {
  __device__ T& operator[](nvfuser_index_t) {
    return *data;
  };

  T* data;
};

// Specialization for 0-dim case that's easy to pass in a CPU based tensor.
template <typename T>
struct CpuScalarTensor {
  __device__ T& operator[](int) {
    return data;
  };

  T data;
};

class Philox {
 public:
  __device__ Philox(
      unsigned long long seed,
      unsigned long long subsequence,
      unsigned long long offset) {
    key.x = (unsigned int)seed;
    key.y = (unsigned int)(seed >> 32);
    counter = make_uint4(0, 0, 0, 0);
    counter.z = (unsigned int)(subsequence);
    counter.w = (unsigned int)(subsequence >> 32);
    STATE = 0;
    incr_n(offset / 4);
  }

  __device__ unsigned long operator()() {
    if (STATE == 0) {
      uint4 counter_ = counter;
      uint2 key_ = key;
      for (int i = 0; i < 9; i++) {
        counter_ = single_round(counter_, key_);
        key_.x += (kPhilox10A);
        key_.y += (kPhilox10B);
      }
      output = single_round(counter_, key_);
      incr();
    }
    unsigned long ret = 0;
    switch (STATE) {
      case 0:
        ret = output.x;
        break;
      case 1:
        ret = output.y;
        break;
      case 2:
        ret = output.z;
        break;
      case 3:
        ret = output.w;
        break;
    }
    STATE = (STATE + 1) % 4;
    return ret;
  }

 private:
  __device__ void incr_n(unsigned long long n) {
    unsigned int nlo = (unsigned int)(n);
    unsigned int nhi = (unsigned int)(n >> 32);
    counter.x += nlo;
    if (counter.x < nlo)
      nhi++;
    counter.y += nhi;
    if (nhi <= counter.y)
      return;
    if (++counter.z)
      return;
    ++counter.w;
  }

  __device__ void incr() {
    if (++counter.x)
      return;
    if (++counter.y)
      return;
    if (++counter.z)
      return;
    ++counter.w;
  }

  __device__ unsigned int mulhilo32(
      unsigned int a,
      unsigned int b,
      unsigned int* result_high) {
    *result_high = __umulhi(a, b);
    return a * b;
  }

  __device__ uint4 single_round(uint4 ctr, uint2 key) {
    unsigned int hi0;
    unsigned int hi1;
    unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
    unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
    uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
    return ret;
  }

 private:
  static constexpr unsigned long kPhilox10A = 0x9E3779B9;
  static constexpr unsigned long kPhilox10B = 0xBB67AE85;
  static constexpr unsigned long kPhiloxSA = 0xD2511F53;
  static constexpr unsigned long kPhiloxSB = 0xCD9E8D57;

  uint4 counter = {};
  uint4 output = {};
  uint2 key = {};
  unsigned int STATE = 0;
};

__device__ float uniformf(unsigned int x) {
  constexpr float kRanInvM32 = 2.3283064e-10f; // Inverse of 2^32.
  return x * kRanInvM32;
}

__device__ double uniform(unsigned int x, unsigned int y) {
  constexpr double kRan2Pow53Inv = 1.1102230246251565e-16;
  const unsigned long long z =
      (unsigned long long)x ^ ((unsigned long long)y << (53 - 32));
  return z * kRan2Pow53Inv + (kRan2Pow53Inv / 2.0);
}

#define NVFUSER_DEFINE_MAGIC_ZERO          \
  __shared__ int nvfuser_zero_s;           \
  if (threadIdx.x == 0)                    \
    nvfuser_zero_s = 0;                    \
  __syncthreads();                         \
  atomicMin(&nvfuser_zero_s, threadIdx.x); \
  int nvfuser_zero = nvfuser_zero_s;

#define NVFUSER_UPDATE_MAGIC_ZERO \
  do {                            \
    nvfuser_zero <<= 1;           \
  } while (0);

__device__ constexpr int ceilDiv(int a, int b) {
  return (a + b - 1) / b;
}

__device__ constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}

__device__ constexpr int64_t ceilDiv(int64_t a, int b) {
  return ceilDiv(a, (int64_t)b);
}

__device__ constexpr int64_t ceilDiv(int a, int64_t b) {
  return ceilDiv((int64_t)a, b);
}

__device__ constexpr int max(int a, int b) {
  return ::max(a, b);
}

__device__ constexpr int64_t max(int64_t a, int b) {
  return ::max(a, (int64_t)b);
}

__device__ constexpr int64_t max(int a, int64_t b) {
  return ::max((int64_t)a, b);
}

__device__ constexpr int64_t max(int64_t a, int64_t b) {
  return ::max(a, b);
}

__device__ double fmax(double a, double b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmax(a, b);
  }
}

__device__ float fmax(float a, float b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmax(a, b);
  }
}

__device__ constexpr int min(int a, int b) {
  return ::min(a, b);
}

__device__ constexpr int64_t min(int64_t a, int b) {
  return ::min(a, (int64_t)b);
}

__device__ constexpr int64_t min(int a, int64_t b) {
  return ::min((int64_t)a, b);
}

__device__ constexpr int64_t min(int64_t a, int64_t b) {
  return ::min(a, b);
}

__device__ double fmin(double a, double b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmin(a, b);
  }
}

__device__ float fmin(float a, float b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmin(a, b);
  }
}

__device__ constexpr int alignBufferSize(int buffer, int size) {
  return (buffer + (size - 1)) & ~(size - 1);
}

__device__ double clamp(double x, double minv, double maxv) {
  return x < minv ? minv : (x > maxv ? maxv : x);
}

__device__ float clamp(float x, double minv, double maxv) {
  return x < minv ? minv : (x > maxv ? maxv : x);
}

__device__ double frac(double x) {
  return x - trunc(x);
}

__device__ float frac(float x) {
  return x - trunc(x);
}

__device__ double gelu(double x) {
  return x * normcdf(x);
}

__device__ float gelu(float x) {
  return x * normcdf(x);
}

__device__ double reciprocal(double x) {
  return 1 / x;
}

__device__ float reciprocal(float x) {
  return 1 / x;
}

__device__ double relu(double x) {
  return x <= 0 ? 0 : x;
}

__device__ float relu(float x) {
  return x <= 0 ? 0 : x;
}

__device__ float relu(int64_t x) {
  return x <= 0 ? 0 : x;
}

__device__ float relu(int x) {
  return x <= 0 ? 0 : x;
}

__device__ double remainder(double a, double b) {
  auto mod = ::fmod(a, b);
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ float remainder(float a, float b) {
  auto mod = ::fmod(a, b);
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ double sigmoid(double x) {
  return 1 / (1 + exp(-x));
}

__device__ float sigmoid(float x) {
  return 1 / (1 + exp(-x));
}

__device__ double silu(double x) {
  return x * sigmoid(x);
}

__device__ float silu(float x) {
  return x * sigmoid(x);
}

__device__ double threshold(double x, double t, double v) {
  return x <= t ? v : x;
}

__device__ float threshold(float x, double t, double v) {
  return x <= t ? v : x;
}

__device__ double where(bool c, double a, double b) {
  return c ? a : b;
}

__device__ float where(bool c, float a, float b) {
  return c ? a : b;
}

__device__ int64_t where(bool c, int64_t a, int64_t b) {
  return c ? a : b;
}

__device__ double randLike(Philox& rnd) {
  return uniform(rnd(), rnd());
}

__device__ float randLikef(Philox& rnd) {
  return uniformf(rnd());
}

__device__ constexpr int64_t remainder(int64_t a, int64_t b) {
  auto mod = a % b;
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ constexpr int remainder(int a, int b) {
  auto mod = a % b;
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ constexpr int64_t fmod(int64_t a, int64_t b) {
  return a % b;
}

__device__ constexpr int fmod(int a, int b) {
  return a % b;
}

__device__ constexpr double fmod(double a, double b) {
  return ::fmod(a, b);
}

__device__ constexpr float fmod(float a, float b) {
  return ::fmod(a, b);
}

template <typename T>
__device__ T pow(T a, T b) {
  if (b < 0) {
    if (a == 1) {
      return 1;
    } else if (a == -1) {
      auto negative = (-b) % static_cast<T>(2);
      return negative ? -1 : 1;
    } else {
      return 0;
    }
  } else {
    T result = 1;
    while (b) {
      if (b & 1) {
        result *= a;
      }
      b /= 2;
      a *= a;
    }
    return result;
  }
}

template int pow<int>(int a, int b);
template int64_t pow<int64_t>(int64_t a, int64_t b);

template <>
float pow<float>(float a, float b) {
  return ::pow(a, b);
}

template <>
double pow<double>(double a, double b) {
  return ::pow(a, b);
}

float pow(float a, int b) {
  return pow(a, (float)b);
}

double pow(double a, int b) {
  return pow(a, (double)b);
}

float pow(float a, int64_t b) {
  return pow(a, (float)b);
}

double pow(double a, int64_t b) {
  return pow(a, (double)b);
}

namespace index_utils {

// Utility functions

// Total size of provided dimension
template <typename _dim3>
__device__ __forceinline__ nvfuser_index_t size(const _dim3& d) {
  return (nvfuser_index_t)d.x * (nvfuser_index_t)d.y * (nvfuser_index_t)d.z;
}

// Linearized indexing of idx based on dim, if bool==false that dimension does
// not participate
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t maskedOffset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = 0;
  if (Z)
    offset += idx.z;
  if (Y)
    offset = offset * dim.y + idx.y;
  if (X)
    offset = offset * dim.x + idx.x;
  return offset;
}

// Linearized indexing of idx based on dim. All dimensions participate.
template <typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t offset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = idx.z;
  offset = offset * dim.y + idx.y;
  offset = offset * dim.x + idx.x;
  return offset;
}

// Masks the provided dim3, those == false get truncated to 1
template <bool X, bool Y, bool Z, typename _dim3>
__device__ dim3 maskedDims(const _dim3& dim) {
  return dim3{
      X ? (unsigned)dim.x : 1U,
      Y ? (unsigned)dim.y : 1U,
      Z ? (unsigned)dim.z : 1U};
}

// Provides total size of dim with masking, those dims == false do not
// participate in the size calculation
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
__device__ nvfuser_index_t maskedSize(const _dim3& dim) {
  return size(maskedDims<X_BLOCK, Y_BLOCK, Z_BLOCK>(dim));
}

// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3>
__device__ bool maskedIsZero(const _dim3& idx) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == 0;
  if (Y)
    isZero = isZero && idx.y == 0;
  if (Z)
    isZero = isZero && idx.z == 0;
  return isZero;
}

// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ bool maskedIsLast(const _dim3& idx, const _dim3_2& dim) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == dim.x - 1;
  if (Y)
    isZero = isZero && idx.y == dim.y - 1;
  if (Z)
    isZero = isZero && idx.z == dim.z - 1;
  return isZero;
}

} // namespace index_utils


// Default block synchronization. Just use __barrier_sync
namespace block_sync {

__forceinline__ __device__ void init() {}

// Thread-block synchronization
__forceinline__ __device__ void sync() {
  __barrier_sync(0);
}

} // namespace block_sync

namespace grid_sync {

// Get the first bit in a 64 bit integer
#define FIRST_UINT64_BIT ((uint64_t)1 << (sizeof(uint64_t) * 8 - 1))

template <typename T>
__device__ T globalAsVolatile(volatile T& global_val) {
  return global_val;
}

// A grid synchronization that can be called multiple times in a kernel assuming
// all the blocks fit on device at once. The semaphore is an integer semaphore
// assumed to be initialized to 0 before launching the kernel. The persistent
// option should be envoked if this sync will be called multiple times in one
// kernel (i.e. having a grid reduce within a loop). Having multiple grid syncs
// called once in the same kernel does not require persistent mode. Segment size
// is the number of blocks participating in the sync in the dimensions marked by
// [X,Y,Z]_BLOCK. The granularity of this sync are those dimensions. I.E.
// Marking X and Y but not Z means there should be Z semaphores of size X*Y.
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, bool PERSISTENT>
__device__ void sync(int64_t& semaphore, const uint64_t& segment_size) {
  // Finish all global memory transactions before synchronizing
  __threadfence();

  // Synchronize all threads in a block before synchronizing blocks
  block_sync::sync();

  // Only allow linear_tid == 0 to participate in the synchronization
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    // Get increment value, only want a single block to have the large
    // increment, doesn't really matter which one, the goal is to flip/flop the
    // first bit of a uint64_t value, since our semaphores are actualy int64_t
    // we will just reinterpret_cast it to act as a uint64_t
    uint64_t semaphore_increment = 1;

    // Makes the assumption that blocks are in increasing order, this is not
    // guaranteed by CUDA but this is the current behavior, and unlikely to
    // change.
    bool last_block =
        index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    if (last_block) {
      semaphore_increment = FIRST_UINT64_BIT - (segment_size - 1);
    }

    uint64_t oldArrive =
        atomicAdd(reinterpret_cast<uint64_t*>(&semaphore), semaphore_increment);

    // If for persistent kernels, lock all blocks until the semaphore has been
    // reached. Make sure we access semaphore as a volatile address so we get
    // the global memory updates.
    while ((PERSISTENT || last_block) &&
           ((oldArrive ^ globalAsVolatile(semaphore)) & FIRST_UINT64_BIT) ==
               0) {
      // Put a sleep here so we have some breaks in probing the global
      // semaphore, giving a better chance for other warps/blocks to catch up.
#if __CUDA_ARCH__ >= 700
      // __nanosleep only available on compute capability 7.0 or higher
      __nanosleep(200); // avoids busy waiting
#endif
    }
  }

  // Sync block to make sure all other threads are waiting on the sync
  block_sync::sync();
}
} // namespace grid_sync

// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block. If set to false the dimension doesn't
// participate in the reduction. We could start with warp reductions, then
// reduce the warps, this could save some shared memory, but could be slower in
// some instances.
//
//  EXAMPLE USAGE:
//  blockReduceSum<X_THREADS, Y_THREADS, Z_THREADS>
//    (output[output_index], inputs[input_index],
//      [] __device__ (T& a, const T b) { a += b; });
//
// Note: We agressively template functions taking dim3 in the functions below
//       because ROCM uses different types for the various dim3 and maps them
//       directly to intrinsics, but they're dim3 when used after modification.
//
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename Func,
    typename _dim3,
    typename _dim3_2>
__device__ void blockReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);

  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);

  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          thread_idx, block_dim);

  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          thread_idx, block_dim);

  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;

  // Initialize shared memory
  if (read_pred) {
    shared_mem[smem_offset] = inp_val;
  } else {
    shared_mem[smem_offset] = init_val;
  }

  block_sync::sync();
  // Reduce down to nearest power of 2 for the tree reduction:
  int np2 = 1 << (31 - __clz(reduction_size));

  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + np2]);
  }
  block_sync::sync();

  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + factor]);
    }
    block_sync::sync();
  }

  if (should_write && write_pred) {
    T result = out;
    reduction_op(result, shared_mem[smem_offset]);
    if (reduction_size > 1) {
      reduction_op(result, shared_mem[smem_offset + 1]);
    }
    out = result;
  }
  block_sync::sync();
}

// Use the same pred for both reads and writes
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename Func,
    typename _dim3,
    typename _dim3_2>
__device__ void blockReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem,
    bool read_write_pred,
    T init_val) {
  blockReduce<X_REDUCE, Y_REDUCE, Z_REDUCE, T, Func, _dim3, _dim3_2>(
      out,
      inp_val,
      reduction_op,
      thread_idx,
      block_dim,
      shared_mem,
      read_write_pred,
      read_write_pred,
      init_val);
}

// Inter-block reduction.
//
// The gridReduce function performs point-wise reductions of scalars across
// thread blocks. Thread blocks are disjointly partitioned into groups,
// "reduction segments", that are collectively defined by boolean template
// parameters, X_BLOCK, Y_BLOCK and Z_BLOCK. Each of X/Y/Z_BLOCK determines
// whether thread blocks along the dimension should be grouped into the same
// reduction segment. Cross-block reducitons are independently done within each
// segment and generates distinctive results per segment. For instance, if all
// of X/Y/Z_BLOCK are true, reductions will be done across all thread blocks
// since there will be just a single segment consisting of all thread blocks. If
// none of them are true, each thread block will become a segment by itself, so
// no reduction will be performed.
//
// The input scalars to reduce within each segment are a certain subset of
// thread-private scalars provided as part of the gridReduce function
// parameters. Boolean template parameters, X_THREAD, Y_THREAD and Z_THREAD,
// determine which subset of the scalars should be used for inter-block
// reductions. Specifically, all the input scalars of threads along each
// dimension will be used when X/Y/Z_THREAD are true. Otherwise, only the value
// held at offset 0 of each dimension will be used. Thus, for example, if all of
// X/Y/Z_THREAD are true, the scalars of all threads in each block will
// participate in inter-block reductions. If all of them are false, only one
// scalar of the thread at threadIdx.x == threadIdx.y == threadIdx.z == 0 will
// be used. In the code below, we call the subset of threads a "reduction
// block". "Participating" thread dimensions here are similar to the
// "non-participating" block dimensions. They come from a block dimension that
// has not been reduced before hitting this grid reduction.
//
// Inter-block reductions perform point-wise reductions of scalars of reduction
// blocks within each reduction segment. More specifically, let rb be a
// reduction block and rs be a reduction segment. Let IN(thread_idx, block_idx)
// denote the input scalar of thread at thread_idx and block_idx. The result of
// each reduction segment, OUT(thread_idx, block_idx_out), is defined only for
// each thread_idx in thread block block_idx_out in the segment as follows:
//
//   OUT(thread_idx, block_idx_out) =
//     Reduction of IN(thread_idx, block_idx) for
//       all block_idx in a reduction segment
//
// OUT is not given for all threads that are not in block_idx_out and the
// reduction block.
//
// See also the function comment of gridReduce.

namespace reduction {
// Reduces all the reduction blocks in each reduction segment. This is the
// "cleanup" stage of a grid reduction.
//
// This is only called by one thread block per reduction segment. The input
// reduction blocks of the segment are stored in an intermediate buffer pointed
// by parameter in. Template parameters X/Y/Z_THREAD denote how the reduction
// block is formed.
//
// The size of a reduction block is by definition smaller or equal to the size
// of a thread block. We use the remaining threads to parallelize reductions
// across reduction blocks. For example, when X/Y/Z_THREAD = {true, false,
// false}, we use blockDim.y*blockDim.z threads for each output value. This is
// done first by loading the input values in parallel and then by reducing
// across threads of dimensions whose XYZ_THREAD are false.
//
// Note that what is done here after the loading from global memory is similar
// to what the existing blockReduce function does.
template <
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    typename T,
    typename Func>
__device__ void gridReduceLastBlock(
    T& out,
    const volatile T* in,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    Func reduction_op,
    T* shared_buf,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.

  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, blockDim);

  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);

  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);

  T inp = init_val;

  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    reduction_op(inp, in[work_buf_offset]);
  }

  // Block reduce the per thread values into per "participating" thread values
  T inp_tmp = init_val;
  blockReduce<!X_THREAD, !Y_THREAD, !Z_THREAD>(
      inp_tmp,
      inp,
      reduction_op,
      threadIdx,
      blockDim,
      shared_buf,
      true,
      init_val);
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    reduction_op(out, inp_tmp);
  }
}

// Reduces per-thread values across thread blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - reduction_op: Scalar reduction function
// - work_buf: Temporary buffer for cross-block reductions
// - sync_flags: A vector of integers for synchronizations
// - shared_buf: Shared memory buffer for intra-block reduction
//
// Thread has valid results based on if it's the last block in the grid
// reduction dimension
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, reduces across thread blocks along the X/Y/Z
//   dimensions
// - X/Y/Z_THREAD: When true, all threads along the X/Y/Z dimensions participate
//   in the cross-block reduction. Otherwise, only threads at offset 0 do.
//   These are set to true if the dimension in the block has not been reduced
//   previously in producer tensors, and does not participate in the reduction
//   (right now they can't), so it's just a "pure" iteration domain as far as
//   the grid reduce is concerned.
// - PERSISTENT_REDUCTION: Indicates grid reduction will be called in a loop, or
//   the result of the grid reduction will be broadcasted and used across the
//   grid. These requires cross grid communication and the grid synchronizations
//   here to actually synchronize across the entire grid. When false the grid is
//   not synchronized, the last block just waits for everyone else to finish and
//   the other blocks can exit early.
// - T: Scalar data type of input/output data
// - Func: Type of scalara reduction function
//
// Template parameters X/Y/Z_BLOCK define a group of thread blocks that are
// reduced together. We call it a reduction segment. Some examples are:
//
// Case 1: X/Y/Z_BLOCK == true/true/true -> There is only one segment, which
// includes all thread blocks. It is effecively the same as the grid.
//
// Case 2: X/Y/Z_BLOCK == false/false/false -> Each thread block comprises an
// individual segment by itself.
//
// Case 3: X/Y/Z_BLOCK == true/false/false -> Each segment contains thread
// blocks that have the same blockDim.x. There will be blockDim.y*blockDim.z
// such segments.
//
// X/Y/Z_THREAD defines a sub region of a thread block that should be reduced
// with the sub regions of other thread blocks. We call it a reduction block.
// E.g.,
//
// Case 1: X/Y/Z_THREAD == false/false/false -> Only thread 0 participates in
// the cross-block reductions. The reduction block is 1x1x1 with thread 0.
//
// Case 2: X/Y/Z_THREAD == true/true/true-> All threads in a thread block
// participate in the cross-block reductions. The reduction block in this case
// is equivalent to the thread block.
//
// After the function completes, only one thread block per reduction segment
// gets valid reduction results. There is no guarantee which particular block
// gets the final results.
//
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    typename T,
    typename Func>
__device__ void gridReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    volatile T* work_buf,
    Tensor<int64_t, 1> sync_flags,
    T* shared_buf,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);

  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);

  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);

  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;

  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, blockDim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    if (read_pred) {
      work_buf[work_buf_offset] = inp_val;
    } else {
      work_buf[work_buf_offset] = init_val;
    }
  }

  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
      sync_flags[idx_in_grid_segment], grid_reduction_segment_size);

  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);

  if (last_block) {
    // Cleanup with block reduction
    gridReduceLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
        out,
        (T*)work_buf,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        reduction_op,
        shared_buf,
        write_pred,
        init_val);
  }

  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
        sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  }
}

} // namespace reduction

#undef isize
#undef ioffset

namespace grid_broadcast {

// Broadcasts per-thread values across threads and blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - work_buf: Temporary buffer for communication across threads/blocks
// - sync_flags: A vector of integers for synchronizations
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, broadcasts across thread blocks along the X/Y/Z
//   dimensions
// - X/Y/Z_THREAD: When true, broadcasts across threads along the X/Y/Z
//   dimensions
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    typename T>
__device__ void broadcast(
    T& out,
    const T& inp_val,
    volatile T* work_buf,
    Tensor<int64_t, 1> sync_flags,
    bool read_write_pred) {
  // Number of values broadcasted in the grid dimensions
  const auto grid_seg_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);

  // Index of the broadcast we're performing out of the grid_seg_size
  const auto grid_seg_idx =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);

  // Number of threads not participating in a broadcast dimension, this is the
  // number of thread entries to expect in the work buffer, therefore a striding
  const auto block_stride =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);

  // Which broadcast in the block this is to line up the entry with the work
  // buffer
  const auto thread_offset =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);

  const bool has_valid_data = (!X_BLOCK || blockIdx.x == gridDim.x - 1) &&
      (!Y_BLOCK || blockIdx.y == gridDim.y - 1) &&
      (!Z_BLOCK || blockIdx.z == gridDim.z - 1) &&
      (!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) &&
      (!Z_THREAD || threadIdx.z == 0);

  if (has_valid_data && read_write_pred) {
    work_buf[grid_seg_idx * block_stride + thread_offset] = inp_val;
    __threadfence();
  }

  bool null = false;
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
      sync_flags[grid_seg_idx], grid_seg_size);

  if (read_write_pred) {
    out = work_buf[grid_seg_idx * block_stride + thread_offset];
  }

  // Make sure everyone has read from the buffer before continuing the kernel
  // and potentially overwriting
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
      sync_flags[grid_seg_idx], grid_seg_size);
}
} // namespace grid_broadcast


namespace broadcast {
// Broadcasts within partitioned groups of threads.
//
// X_THREAD: Broadcast from threadIdx.x == 0 if true
// Y_THREAD: Broadcast from threadIdx.y == 0 if true
// Z_THREAD: Broadcast from threadIdx.z == 0 if true
// inp_val: Per-thread source value. Only valid when the thread is a source.
// out: Per-thread output location
//
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T>
__device__ void blockBroadcast(
    T& out,
    const T& inp_val,
    T* shared_mem,
    bool read_write_pred) {
  const bool has_valid_data = (!X_THREAD || threadIdx.x == 0) &&
      (!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0);

  const auto shared_offset =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);

  if (has_valid_data && read_write_pred) {
    shared_mem[shared_offset] = inp_val;
  }

  block_sync::sync();

  if (read_write_pred) {
    out = shared_mem[shared_offset];
  }

  block_sync::sync();
}

} // namespace broadcast

// -----------------------------------------------------------------------------------------------
//  Block Welford Primitives
// -----------------------------------------------------------------------------------------------
// Basic utility for welford update. Can be used to scan one value, or two merge
// two welford results
template <typename T, typename TN>
__inline__ __device__ void welfordCombine(
    T& a_avg,
    T& a_M2,
    TN& a_N,
    const T b_avg,
    const T b_M2,
    TN b_N) {
  if (b_N == 0) {
    return;
  }
  TN ab_N = a_N + b_N;
  T b_N_div_ab_N = ((T)(nvfuser_index_t)(b_N)) / ((T)(nvfuser_index_t)(ab_N));
  T delta = b_avg - a_avg;
  a_avg += delta * b_N_div_ab_N;
  a_M2 += b_M2 + delta * delta * ((T)(nvfuser_index_t)(a_N)) * b_N_div_ab_N;
  a_N = ab_N;
}

// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename TN,
    typename _dim3,
    typename _dim3_2>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);

  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);

  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          thread_idx, block_dim);

  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          thread_idx, block_dim);

  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;

  if (read_pred) {
    shared_mem_avg[smem_offset] = in_avg;
    shared_mem_M2[smem_offset] = in_M2;
    shared_mem_N[smem_offset] = in_N;
  } else {
    shared_mem_avg[smem_offset] = init_val;
    shared_mem_M2[smem_offset] = init_val;
    shared_mem_N[smem_offset] = 0;
  }

  block_sync::sync();
  // Reduce down to nearest power of 2:
  int np2 = 1 << (31 - __clz(reduction_size));

  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    welfordCombine(
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset],
        shared_mem_avg[smem_offset + np2],
        shared_mem_M2[smem_offset + np2],
        shared_mem_N[smem_offset + np2]);
  }
  block_sync::sync();

  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      welfordCombine(
          shared_mem_avg[smem_offset],
          shared_mem_M2[smem_offset],
          shared_mem_N[smem_offset],
          shared_mem_avg[smem_offset + factor],
          shared_mem_M2[smem_offset + factor],
          shared_mem_N[smem_offset + factor]);
    }
    block_sync::sync();
  }

  if (should_write && write_pred) {
    T res_avg = out_avg;
    T res_M2 = out_M2;
    TN res_N = out_N;
    welfordCombine(
        res_avg,
        res_M2,
        res_N,
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset]);
    if (reduction_size > 1) {
      welfordCombine(
          res_avg,
          res_M2,
          res_N,
          shared_mem_avg[smem_offset + 1],
          shared_mem_M2[smem_offset + 1],
          shared_mem_N[smem_offset + 1]);
    }
    out_avg = res_avg;
    out_M2 = res_M2;
    out_N = res_N;
  }
  block_sync::sync();
}

// Use the same pred for both reads and writes
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename TN,
    typename _dim3,
    typename _dim3_2>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_write_pred,
    T init_val) {
  blockWelford<X_REDUCE, Y_REDUCE, Z_REDUCE, T, TN, _dim3, _dim3_2>(
      out_avg,
      out_M2,
      out_N,
      in_avg,
      in_M2,
      in_N,
      thread_idx,
      block_dim,
      shared_mem_avg,
      shared_mem_M2,
      shared_mem_N,
      read_write_pred,
      read_write_pred,
      init_val);
}
// -----------------------------------------------------------------------------------------------
//  Grid Welford Prototype
// -----------------------------------------------------------------------------------------------
namespace welford {

template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T, typename TN>
__device__ void gridWelfordLastBlock(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const volatile T* in_avg,
    const volatile T* in_M2,
    const volatile TN* in_N,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.

  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, blockDim);

  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);

  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);

  T inp_avg = init_val;
  T inp_M2 = init_val;
  TN inp_N = 0;

  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    welfordCombine(
        inp_avg,
        inp_M2,
        inp_N,
        in_avg[work_buf_offset],
        in_M2[work_buf_offset],
        in_N[work_buf_offset]);
  }

  // Block reduce the per thread values into per "participating" thread values
  T inp_avg_tmp = init_val;
  T inp_M2_tmp = init_val;
  TN inp_N_tmp = 0;
  blockWelford<!X_THREAD, !Y_THREAD, !Z_THREAD>(
      inp_avg_tmp,
      inp_M2_tmp,
      inp_N_tmp,
      inp_avg,
      inp_M2,
      inp_N,
      threadIdx,
      blockDim,
      shared_buf_avg,
      shared_buf_M2,
      shared_buf_N,
      true,
      init_val);
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    welfordCombine(out_avg, out_M2, out_N, inp_avg_tmp, inp_M2_tmp, inp_N_tmp);
  }
}

//    Grid welford combine
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    typename T,
    typename TN>
__device__ void gridWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& inp_avg,
    const T& inp_M2,
    const TN& inp_N,
    volatile T* work_buf_avg,
    volatile T* work_buf_M2,
    volatile TN* work_buf_N,
    Tensor<int64_t, 1> sync_flags,
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);

  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);

  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);

  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf_avg += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  work_buf_M2 += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  work_buf_N += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;

  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, blockDim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    if (read_pred) {
      work_buf_avg[work_buf_offset] = inp_avg;
      work_buf_M2[work_buf_offset] = inp_M2;
      work_buf_N[work_buf_offset] = inp_N;
    } else {
      work_buf_avg[work_buf_offset] = init_val;
      work_buf_M2[work_buf_offset] = init_val;
      work_buf_N[work_buf_offset] = 0;
    }
  }

  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
      sync_flags[idx_in_grid_segment], grid_reduction_segment_size);

  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);

  if (last_block) {
    // final reduction
    gridWelfordLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
        out_avg,
        out_M2,
        out_N,
        work_buf_avg,
        work_buf_M2,
        work_buf_N,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        shared_buf_avg,
        shared_buf_M2,
        shared_buf_N,
        write_pred,
        init_val);
  }

  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
        sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  }
}

} // namespace welford

#undef isize
#undef ioffset

namespace warp {

template <
    bool SINGLE_WARP,
    typename T,
    typename Func,
    typename _dim3ti,
    typename _dim3bd>
__device__ void warpReduceTIDX(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3ti& thread_idx,
    const _dim3bd& block_dim,
    T* shared_mem,
    bool read_write_pred,
    T init_val) {
  constexpr int WARP_SIZE = 32;

  // Assume input padded to multiples of a warp
  T reduce_val = init_val;

  // Do warp reduction
  if (read_write_pred) {
    reduce_val = inp_val;
  }

  // Reduce within each warp
  for (int i = 16; i >= 1; i /= 2) {
    reduction_op(
        reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, WARP_SIZE));
  }

  // Reduce across warp if needed
  // Load value to shared mem
  if (!SINGLE_WARP) {
    unsigned int warp_idx = thread_idx.x / WARP_SIZE;
    unsigned int lane_idx = thread_idx.x % WARP_SIZE;
    unsigned int reduce_group_id = thread_idx.z * block_dim.y + thread_idx.y;
    bool is_warp_head = lane_idx == 0;
    unsigned int reduction_size = block_dim.x;
    unsigned int num_of_warps = reduction_size / WARP_SIZE;
    unsigned int smem_offset = reduce_group_id * num_of_warps;

    block_sync::sync();

    if (read_write_pred && is_warp_head) {
      shared_mem[smem_offset + warp_idx] = reduce_val;
    }

    block_sync::sync();

    if (warp_idx == 0) {
      // This assumes num_of_warps will be < 32, meaning < 1024 blocks.
      //  Should be true for long enough.
      assert(num_of_warps <= 32);

      reduce_val = lane_idx < num_of_warps ? shared_mem[smem_offset + lane_idx]
                                           : init_val;

      // Reduce within warp 0
      for (int i = 16; i >= 1; i /= 2) {
        reduction_op(
            reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, 32));
      }
    }

    if (is_warp_head) {
      reduction_op(out, reduce_val);
    }
  } else {
    reduction_op(out, reduce_val);
  }
}

} // namespace warp

// No "#pragma once" because this is a raw definition that can be copied by jit codegen.
// Eager mode clients should not include this file directly, instead,
// they should #include <ATen/cuda/CUDAGeneratorImpl.h>, which has a #pragma once.

// Stores RNG state values. Passed as a kernel argument.
// See Note [CUDA Graph-safe RNG states].
//
// The raw definition lives in its own file so jit codegen can easily copy it.
namespace at {

struct PhiloxCudaState {
  PhiloxCudaState() = default;
  // Called if graph capture is not underway
  PhiloxCudaState(uint64_t seed,
                  uint64_t offset) {
    seed_ = seed;
    offset_.val = offset;
  }
  // Called if graph capture is underway
  PhiloxCudaState(uint64_t seed,
                  int64_t* offset_extragraph,
                  uint32_t offset_intragraph) {
    seed_ = seed;
    offset_.ptr = offset_extragraph;
    offset_intragraph_ = offset_intragraph;
    captured_ = true;
  }

  // Public members, directly accessible by at::cuda::philox::unpack.
  // If we made them private with getters/setters, the getters/setters
  // would have to be __device__, and we can't declare __device__ in ATen.
  union Payload {
    uint64_t val;
    int64_t* ptr;
  };

  uint64_t seed_ = 0;
  Payload offset_;
  uint32_t offset_intragraph_ = 0;
  bool captured_ = false;
};

} // namespace at
__global__ void kernel196(Tensor<bool, 0> T0, Tensor<bool, 0> T1, Tensor<bool, 0> T2, Tensor<bool, 0> T3) {
  T3[0]
     = where(T0[0], T1[0], T2[0]);
}
}

CUDA NVRTC compile error: default_program(1694): error: more than one instance of overloaded function "CudaCodeGen::where" matches the argument list:
            function "CudaCodeGen::where(__nv_bool, double, double)"
            function "CudaCodeGen::where(__nv_bool, float, float)"
            function "CudaCodeGen::where(__nv_bool, CudaCodeGen::int64_t, CudaCodeGen::int64_t)"
            argument types are: (__nv_bool, __nv_bool, __nv_bool)

1 error detected in the compilation of "default_program".


======================================================================
ERROR: test_nvfuser_correctness__masked_prod_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: CUDA driver error: invalid resource handle


======================================================================
ERROR: test_nvfuser_correctness__masked_softmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":16, please report a bug to PyTorch.

======================================================================
ERROR: test_nvfuser_correctness__masked_softmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":16, please report a bug to PyTorch.

======================================================================
ERROR: test_nvfuser_correctness__masked_softmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":16, please report a bug to PyTorch.

======================================================================
ERROR: test_nvfuser_correctness__masked_softmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":16, please report a bug to PyTorch.

======================================================================
ERROR: test_nvfuser_correctness__masked_softmin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":16, please report a bug to PyTorch.

======================================================================
ERROR: test_nvfuser_correctness__masked_softmin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":16, please report a bug to PyTorch.

======================================================================
ERROR: test_nvfuser_correctness__masked_softmin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":16, please report a bug to PyTorch.

======================================================================
ERROR: test_nvfuser_correctness__masked_softmin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":16, please report a bug to PyTorch.

======================================================================
ERROR: test_nvfuser_correctness__masked_sum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_sum_cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: CUDA driver error: invalid resource handle


======================================================================
ERROR: test_nvfuser_correctness__masked_sum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_sum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_sum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_sum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: CUDA driver error: invalid resource handle


======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: view->kind() == prim::view_copy || view->kind() == prim::reshape_copyINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/graph_fuser.cpp":1613, please report a bug to PyTorch.

======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: view->kind() == prim::view_copy || view->kind() == prim::reshape_copyINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/graph_fuser.cpp":1613, please report a bug to PyTorch.

======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: graph_cache_.count(kernel_id) > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/manager.cpp":108, please report a bug to PyTorch. graph cache miss at run time


======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: graph_cache_.count(kernel_id) > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/manager.cpp":108, please report a bug to PyTorch. graph cache miss at run time


======================================================================
ERROR: test_nvfuser_correctness_acos_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_acos_cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_acos_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_acos_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_acos_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_acos_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_acos_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_add_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_add_cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_add_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_add_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_add_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_add_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_add_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_amax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness_amax_cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Could not generate a max op for tensor with type: bool

======================================================================
ERROR: test_nvfuser_correctness_amax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness_amax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness_amax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness_amax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Could not generate a max op for tensor with type: int

======================================================================
ERROR: test_nvfuser_correctness_amax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor

======================================================================
ERROR: test_nvfuser_correctness_bitwise_not_cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_bitwise_not_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_bitwise_not_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from  DataType: float to DataType: bool

======================================================================
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from  DataType: float to DataType: bool

======================================================================
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from  DataType: float to DataType: bool

======================================================================
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from  DataType: double to DataType: bool

======================================================================
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from  DataType: int to DataType: bool

======================================================================
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from  DataType: int64_t to DataType: bool

======================================================================
ERROR: test_nvfuser_correctness_bool_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from  DataType: float to DataType: bool

======================================================================
ERROR: test_nvfuser_correctness_bool_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from  DataType: float to DataType: bool

======================================================================
ERROR: test_nvfuser_correctness_bool_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from  DataType: float to DataType: bool

======================================================================
ERROR: test_nvfuser_correctness_bool_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from  DataType: double to DataType: bool

======================================================================
ERROR: test_nvfuser_correctness_bool_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from  DataType: int to DataType: bool

======================================================================
ERROR: test_nvfuser_correctness_bool_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from  DataType: int64_t to DataType: bool

======================================================================
ERROR: test_nvfuser_correctness_clamp_scalar_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_clamp_scalar_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_clamp_scalar_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_clamp_scalar_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.


======================================================================
ERROR: test_nvfuser_correctness_clamp_scalar_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
    dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
    method(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
    raise rte
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
    result = test(self, **param_kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
    return test(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3739, in test_nvfuser_correctness
    trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
    traced_fn.last_graph = traced.graph_for(*inputs_tensors)  # type: ignore[attr-defined]
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
    return _script_method_graph_for(self, self, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
    self(*args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
    return prof_callable(func_call, *args, **kwargs)
  File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
    return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: falseINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/executor_utils.cpp":972, please report a bug to PyTorch. namespace CudaCodeGen {

typedef unsigned char uint8_t;
typedef signed char int8_t;
typedef short int int16_t;
typedef int int32_t;
typedef unsigned int uint32_t;
typedef long long int int64_t;
typedef unsigned long long int uint64_t;
typedef int nvfuser_index_t;


#define __NVFUSER_HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_HALF_TO_CUS(var) \
  *(reinterpret_cast<const unsigned short*>(&(var)))

struct __half;
__device__ __half __float2half(const float);

struct __align__(2) __half {
  __half() = default;

  __device__ __half(const float f) {
    __x = __float2half(f).__x;
  }

 protected:
  unsigned short __x;
};

__device__ __half __float2half(const float f) {
  __half val;
  asm("{  cvt.rn.f16.f32 %0, %1;}\n"
      : "=h"(__NVFUSER_HALF_TO_US(val))
      : "f"(f));
  return val;
}

__device__ float __half2float(const __half h) {
  float val;
  asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__NVFUSER_HALF_TO_CUS(h)));
  return val;
}

// aligned vector generates vectorized load/store on CUDA
template <typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) Array {
  scalar_t val[vec_size];
  __device__ void set(scalar_t v) {
    for (int i = 0; i < vec_size; ++i) {
      val[i] = v;
    }
  }
};


#define __NVFUSER_BFLOAT_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_BFLOAT_TO_CUS(var) \
  *(reinterpret_cast<const unsigned short*>(&(var)))

struct __bfloat;
__device__ __bfloat __float2bfloat(const float);

struct __align__(2) __bfloat {
  __bfloat() = default;

  __device__ __bfloat(const float f) {
    __x = __float2bfloat(f).__x;
  }

 protected:
  unsigned short __x;
};

__device__ __bfloat __float2bfloat(const float f) {
  __bfloat val;
  asm("{  cvt.rn.bf16.f32 %0, %1;}\n"
      : "=h"(__NVFUSER_BFLOAT_TO_US(val))
      : "f"(f));
  return val;
}

__device__ float __bfloat2float(const __bfloat h) {
  float val;
  asm("{  mov.b32 %0, {0,%1};}\n"
      : "=f"(val)
      : "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
  return val;
}

template <typename T, int N>
struct Tensor {
  __device__ T& operator[](nvfuser_index_t ind) {
    return data[ind];
  };

  T* data;
  nvfuser_index_t size[N];
  nvfuser_index_t stride[N];
};

// Specialization for 0-dim case as it does not need size and stride arrays.
// They will be an error as well since zero-length arrays are not allowed.
template <typename T>
struct Tensor<T, 0> {
  __device__ T& operator[](nvfuser_index_t) {
    return *data;
  };

  T* data;
};

// Specialization for 0-dim case that's easy to pass in a CPU based tensor.
template <typename T>
struct CpuScalarTensor {
  __device__ T& operator[](int) {
    return data;
  };

  T data;
};

class Philox {
 public:
  __device__ Philox(
      unsigned long long seed,
      unsigned long long subsequence,
      unsigned long long offset) {
    key.x = (unsigned int)seed;
    key.y = (unsigned int)(seed >> 32);
    counter = make_uint4(0, 0, 0, 0);
    counter.z = (unsigned int)(subsequence);
    counter.w = (unsigned int)(subsequence >> 32);
    STATE = 0;
    incr_n(offset / 4);
  }

  __device__ unsigned long operator()() {
    if (STATE == 0) {
      uint4 counter_ = counter;
      uint2 key_ = key;
      for (int i = 0; i < 9; i++) {
        counter_ = single_round(counter_, key_);
        key_.x += (kPhilox10A);
        key_.y += (kPhilox10B);
      }
      output = single_round(counter_, key_);
      incr();
    }
    unsigned long ret = 0;
    switch (STATE) {
      case 0:
        ret = output.x;
        break;
      case 1:
        ret = output.y;
        break;
      case 2:
        ret = output.z;
        break;
      case 3:
        ret = output.w;
        break;
    }
    STATE = (STATE + 1) % 4;
    return ret;
  }

 private:
  __device__ void incr_n(unsigned long long n) {
    unsigned int nlo = (unsigned int)(n);
    unsigned int nhi = (unsigned int)(n >> 32);
    counter.x += nlo;
    if (counter.x < nlo)
      nhi++;
    counter.y += nhi;
    if (nhi <= counter.y)
      return;
    if (++counter.z)
      return;
    ++counter.w;
  }

  __device__ void incr() {
    if (++counter.x)
      return;
    if (++counter.y)
      return;
    if (++counter.z)
      return;
    ++counter.w;
  }

  __device__ unsigned int mulhilo32(
      unsigned int a,
      unsigned int b,
      unsigned int* result_high) {
    *result_high = __umulhi(a, b);
    return a * b;
  }

  __device__ uint4 single_round(uint4 ctr, uint2 key) {
    unsigned int hi0;
    unsigned int hi1;
    unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
    unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
    uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
    return ret;
  }

 private:
  static constexpr unsigned long kPhilox10A = 0x9E3779B9;
  static constexpr unsigned long kPhilox10B = 0xBB67AE85;
  static constexpr unsigned long kPhiloxSA = 0xD2511F53;
  static constexpr unsigned long kPhiloxSB = 0xCD9E8D57;

  uint4 counter = {};
  uint4 output = {};
  uint2 key = {};
  unsigned int STATE = 0;
};

__device__ float uniformf(unsigned int x) {
  constexpr float kRanInvM32 = 2.3283064e-10f; // Inverse of 2^32.
  return x * kRanInvM32;
}

__device__ double uniform(unsigned int x, unsigned int y) {
  constexpr double kRan2Pow53Inv = 1.1102230246251565e-16;
  const unsigned long long z =
      (unsigned long long)x ^ ((unsigned long long)y << (53 - 32));
  return z * kRan2Pow53Inv + (kRan2Pow53Inv / 2.0);
}

#define NVFUSER_DEFINE_MAGIC_ZERO          \
  __shared__ int nvfuser_zero_s;           \
  if (threadIdx.x == 0)                    \
    nvfuser_zero_s = 0;                    \
  __syncthreads();                         \
  atomicMin(&nvfuser_zero_s, threadIdx.x); \
  int nvfuser_zero = nvfuser_zero_s;

#define NVFUSER_UPDATE_MAGIC_ZERO \
  do {                            \
    nvfuser_zero <<= 1;           \
  } while (0);

__device__ constexpr int ceilDiv(int a, int b) {
  return (a + b - 1) / b;
}

__device__ constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}

__device__ constexpr int64_t ceilDiv(int64_t a, int b) {
  return ceilDiv(a, (int64_t)b);
}

__device__ constexpr int64_t ceilDiv(int a, int64_t b) {
  return ceilDiv((int64_t)a, b);
}

__device__ constexpr int max(int a, int b) {
  return ::max(a, b);
}

__device__ constexpr int64_t max(int64_t a, int b) {
  return ::max(a, (int64_t)b);
}

__device__ constexpr int64_t max(int a, int64_t b) {
  return ::max((int64_t)a, b);
}

__device__ constexpr int64_t max(int64_t a, int64_t b) {
  return ::max(a, b);
}

__device__ double fmax(double a, double b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmax(a, b);
  }
}

__device__ float fmax(float a, float b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmax(a, b);
  }
}

__device__ constexpr int min(int a, int b) {
  return ::min(a, b);
}

__device__ constexpr int64_t min(int64_t a, int b) {
  return ::min(a, (int64_t)b);
}

__device__ constexpr int64_t min(int a, int64_t b) {
  return ::min((int64_t)a, b);
}

__device__ constexpr int64_t min(int64_t a, int64_t b) {
  return ::min(a, b);
}

__device__ double fmin(double a, double b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmin(a, b);
  }
}

__device__ float fmin(float a, float b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmin(a, b);
  }
}

__device__ constexpr int alignBufferSize(int buffer, int size) {
  return (buffer + (size - 1)) & ~(size - 1);
}

__device__ double clamp(double x, double minv, double maxv) {
  return x < minv ? minv : (x > maxv ? maxv : x);
}

__device__ float clamp(float x, double minv, double maxv) {
  return x < minv ? minv : (x > maxv ? maxv : x);
}

__device__ double frac(double x) {
  return x - trunc(x);
}

__device__ float frac(float x) {
  return x - trunc(x);
}

__device__ double gelu(double x) {
  return x * normcdf(x);
}

__device__ float gelu(float x) {
  return x * normcdf(x);
}

__device__ double reciprocal(double x) {
  return 1 / x;
}

__device__ float reciprocal(float x) {
  return 1 / x;
}

__device__ double relu(double x) {
  return x <= 0 ? 0 : x;
}

__device__ float relu(float x) {
  return x <= 0 ? 0 : x;
}

__device__ float relu(int64_t x) {
  return x <= 0 ? 0 : x;
}

__device__ float relu(int x) {
  return x <= 0 ? 0 : x;
}

__device__ double remainder(double a, double b) {
  auto mod = ::fmod(a, b);
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ float remainder(float a, float b) {
  auto mod = ::fmod(a, b);
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ double sigmoid(double x) {
  return 1 / (1 + exp(-x));
}

__device__ float sigmoid(float x) {
  return 1 / (1 + exp(-x));
}

__device__ double silu(double x) {
  return x * sigmoid(x);
}

__device__ float silu(float x) {
  return x * sigmoid(x);
}

__device__ double threshold(double x, double t, double v) {
  return x <= t ? v : x;
}

__device__ float threshold(float x, double t, double v) {
  return x <= t ? v : x;
}

__device__ double where(bool c, double a, double b) {
  return c ? a : b;
}

__device__ float where(bool c, float a, float b) {
  return c ? a : b;
}

__device__ int64_t where(bool c, int64_t a, int64_t b) {
  return c ? a : b;
}

__device__ double randLike(Philox& rnd) {
  return uniform(rnd(), rnd());
}

__device__ float randLikef(Philox& rnd) {
  return uniformf(rnd());
}

__device__ constexpr int64_t remainder(int64_t a, int64_t b) {
  auto mod = a % b;
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ constexpr int remainder(int a, int b) {
  auto mod = a % b;
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}

__device__ constexpr int64_t fmod(int64_t a, int64_t b) {
  return a % b;
}

__device__ constexpr int fmod(int a, int b) {
  return a % b;
}

__device__ constexpr double fmod(double a, double b) {
  return ::fmod(a, b);
}

__device__ constexpr float fmod(float a, float b) {
  return ::fmod(a, b);
}

template <typename T>
__device__ T pow(T a, T b) {
  if (b < 0) {
    if (a == 1) {
      return 1;
    } else if (a == -1) {
      auto negative = (-b) % static_cast<T>(2);
      return negative ? -1 : 1;
    } else {
      return 0;
    }
  } else {
    T result = 1;
    while (b) {
      if (b & 1) {
        result *= a;
      }
      b /= 2;
      a *= a;
    }
    return result;
  }
}

template int pow<int>(int a, int b);
template int64_t pow<int64_t>(int64_t a, int64_t b);

template <>
float pow<float>(float a, float b) {
  return ::pow(a, b);
}

template <>
double pow<double>(double a, double b) {
  return ::pow(a, b);
}

float pow(float a, int b) {
  return pow(a, (float)b);
}

double pow(double a, int b) {
  return pow(a, (double)b);
}

float pow(float a, int64_t b) {
  return pow(a, (float)b);
}

double pow(double a, int64_t b) {
  return pow(a, (double)b);
}

namespace index_utils {

// Utility functions

// Total size of provided dimension
template <typename _dim3>
__device__ __forceinline__ nvfuser_index_t size(const _dim3& d) {
  return (nvfuser_index_t)d.x * (nvfuser_index_t)d.y * (nvfuser_index_t)d.z;
}

// Linearized indexing of idx based on dim, if bool==false that dimension does
// not participate
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t maskedOffset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = 0;
  if (Z)
    offset += idx.z;
  if (Y)
    offset = offset * dim.y + idx.y;
  if (X)
    offset = offset * dim.x + idx.x;
  return offset;
}

// Linearized indexing of idx based on dim. All dimensions participate.
template <typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t offset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = idx.z;
  offset = offset * dim.y + idx.y;
  offset = offset * dim.x + idx.x;
  return offset;
}

// Masks the provided dim3, those == false get truncated to 1
template <bool X, bool Y, bool Z, typename _dim3>
__device__ dim3 maskedDims(const _dim3& dim) {
  return dim3{
      X ? (unsigned)dim.x : 1U,
      Y ? (unsigned)dim.y : 1U,
      Z ? (unsigned)dim.z : 1U};
}

// Provides total size of dim with masking, those dims == false do not
// participate in the size calculation
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
__device__ nvfuser_index_t maskedSize(const _dim3& dim) {
  return size(maskedDims<X_BLOCK, Y_BLOCK, Z_BLOCK>(dim));
}

// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3>
__device__ bool maskedIsZero(const _dim3& idx) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == 0;
  if (Y)
    isZero = isZero && idx.y == 0;
  if (Z)
    isZero = isZero && idx.z == 0;
  return isZero;
}

// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ bool maskedIsLast(const _dim3& idx, const _dim3_2& dim) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == dim.x - 1;
  if (Y)
    isZero = isZero && idx.y == dim.y - 1;
  if (Z)
    isZero = isZero && idx.z == dim.z - 1;
  return isZero;
}

} // namespace index_utils


// Default block synchronization. Just use __barrier_sync
namespace block_sync {

__forceinline__ __device__ void init() {}

// Thread-block synchronization
__forceinline__ __device__ void sync() {
  __barrier_sync(0);
}

} // namespace block_sync

namespace grid_sync {

// Get the first bit in a 64 bit integer
#define FIRST_UINT64_BIT ((uint64_t)1 << (sizeof(uint64_t) * 8 - 1))

template <typename T>
__device__ T globalAsVolatile(volatile T& global_val) {
  return global_val;
}

// A grid synchronization that can be called multiple