Skip to content

Instantly share code, notes, and snippets.

@davidberard98
Last active February 5, 2022 01:51
Show Gist options
  • Save davidberard98/c6c14c786e5d0e90f6efa38ed81df4a7 to your computer and use it in GitHub Desktop.
nvfuser opinfo tests - results feb 4, 2022
This file has been truncated, but you can view the full file.
srun: job 21182 queued and waiting for resources
srun: job 21182 has been allocated resources
srun: error: ioctl(TIOCGWINSZ): Inappropriate ioctl for device
srun: error: Not using a pseudo-terminal, disregarding --pty option
monkeytype is not installed. Skipping tests for Profile-Directed Typing
test_nvfuser_correctness_H_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_H_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_T_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___getitem___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___getitem___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___radd___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:424: UserWarning: fast math disabled in nvfuser, try set `PYTORCH_NVFUSER_DISABLE_FASTMATH=0` (Triggered internally at ../torch/csrc/jit/codegen/cuda/executor_utils.cpp:705.)
return callable(*args, **kwargs)
ok
test_nvfuser_correctness___radd___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___radd___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___radd___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rand___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rand___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rand___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rand___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rand___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rand___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rdiv___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmatmul___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmatmul___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmatmul___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmatmul___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmatmul___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmatmul___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmod___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rmul___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rmul___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___ror___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___ror___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___ror___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___ror___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___ror___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___ror___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rpow___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/_tensor.py:627: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
return torch.tensor(other, dtype=dtype, device=self.device) ** self
ERROR
test_nvfuser_correctness___rpow___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rpow___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness___rsub___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rsub___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rsub___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rsub___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rsub___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rsub___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rsub___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rsub___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rsub___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rsub___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rsub___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rxor___cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rxor___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rxor___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rxor___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rxor___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness___rxor___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness__masked_amax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amax_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/_masked/__init__.py:333: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
return torch.tensor(torch.iinfo(dtype).min, dtype=dtype, device=device)
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amax_cuda_int16! Caching allocator allocated memory was 467456 and is now reported as 495616 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_amax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amax_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amax_cuda_int8! Caching allocator allocated memory was 495616 and is now reported as 523776 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_amax_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amax_cuda_uint8! Caching allocator allocated memory was 523776 and is now reported as 551936 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_amin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/_masked/__init__.py:336: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
return torch.tensor(torch.inf, dtype=dtype, device=device)
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amin_cuda_bfloat16! Caching allocator allocated memory was 551936 and is now reported as 580096 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_amin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amin_cuda_float16! Caching allocator allocated memory was 580096 and is now reported as 608256 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_amin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/_masked/__init__.py:338: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
return torch.tensor(torch.iinfo(dtype).max, dtype=dtype, device=device)
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amin_cuda_int16! Caching allocator allocated memory was 608256 and is now reported as 636416 on device 0. CUDA driver allocated memory was 1700790272 and is now 1700790272.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_amin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_amin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_log_softmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/_masked/__init__.py:331: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
return torch.tensor(-torch.inf, dtype=dtype, device=device)
/fsx/users/dberard/pytorch/torch/_masked/__init__.py:386: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
elif mask.shape != input.shape:
ERROR
test_nvfuser_correctness__masked_log_softmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_log_softmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_log_softmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_mean_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/_masked/__init__.py:351: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
return torch.tensor(0, dtype=dtype, device=device)
/fsx/users/dberard/pytorch/torch/_masked/__init__.py:350: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
return torch.tensor(torch.inf, dtype=dtype, device=device)
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_norm_cuda_bfloat16! Caching allocator allocated memory was 809472 and is now reported as 950272 on device 0. CUDA driver allocated memory was 1713373184 and is now 1713373184.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_norm_cuda_float16! Caching allocator allocated memory was 950272 and is now reported as 1091072 on device 0. CUDA driver allocated memory was 1713373184 and is now 1713373184.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_norm_cuda_float32! Caching allocator allocated memory was 1091072 and is now reported as 1231872 on device 0. CUDA driver allocated memory was 1713373184 and is now 1713373184.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_norm_cuda_float64! Caching allocator allocated memory was 1231872 and is now reported as 1372672 on device 0. CUDA driver allocated memory was 1713373184 and is now 1713373184.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_normalize_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness__masked_normalize_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness__masked_normalize_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness__masked_normalize_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness__masked_prod_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_bfloat16! Caching allocator allocated memory was 1372672 and is now reported as 1400832 on device 0. CUDA driver allocated memory was 1715470336 and is now 1715470336.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_prod_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_prod_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_complex128! Caching allocator allocated memory was 1401344 and is now reported as 1429504 on device 0. CUDA driver allocated memory was 1715470336 and is now 1715470336.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_prod_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_complex64! Caching allocator allocated memory was 1429504 and is now reported as 1457664 on device 0. CUDA driver allocated memory was 1715470336 and is now 1715470336.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_prod_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_float16! Caching allocator allocated memory was 1457664 and is now reported as 1485824 on device 0. CUDA driver allocated memory was 1715470336 and is now 1715470336.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_prod_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_prod_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_float64! Caching allocator allocated memory was 1513984 and is now reported as 1542144 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_prod_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_int16! Caching allocator allocated memory was 1542144 and is now reported as 1570304 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_prod_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_prod_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_int64! Caching allocator allocated memory was 1570816 and is now reported as 1598976 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_prod_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_int8! Caching allocator allocated memory was 1598976 and is now reported as 1627136 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_prod_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_uint8! Caching allocator allocated memory was 1627136 and is now reported as 1655296 on device 0. CUDA driver allocated memory was 1717567488 and is now 1717567488.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_softmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_softmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_softmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_softmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_softmin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_softmin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_softmin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_softmin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_sum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_sum_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_sum_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_sum_cuda_complex128! Caching allocator allocated memory was 1737728 and is now reported as 1765888 on device 0. CUDA driver allocated memory was 1721761792 and is now 1721761792.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_sum_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_sum_cuda_complex64! Caching allocator allocated memory was 1765888 and is now reported as 1794048 on device 0. CUDA driver allocated memory was 1721761792 and is now 1721761792.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_sum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_sum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_sum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_sum_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_sum_cuda_int16! Caching allocator allocated memory was 1794048 and is now reported as 1822208 on device 0. CUDA driver allocated memory was 1721761792 and is now 1721761792.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_sum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_sum_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_sum_cuda_int64! Caching allocator allocated memory was 1822720 and is now reported as 1850880 on device 0. CUDA driver allocated memory was 1721761792 and is now 1721761792.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_sum_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_sum_cuda_int8! Caching allocator allocated memory was 1850880 and is now reported as 1879040 on device 0. CUDA driver allocated memory was 1721761792 and is now 1721761792.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_sum_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_sum_cuda_uint8! Caching allocator allocated memory was 1879040 and is now reported as 1907200 on device 0. CUDA driver allocated memory was 1721761792 and is now 1721761792.
warnings.warn(msg)
ok
test_nvfuser_correctness__masked_var_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness__masked_var_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_abs_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_abs_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acos_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_acos_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_acos_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acos_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acos_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_acos_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_acos_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_acos_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acos_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_acos_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_acos_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acos_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_acosh_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_add_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_add_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_add_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_add_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_add_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_add_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_add_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_add_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_add_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_add_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_add_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_add_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addbmm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addbmm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addbmm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addbmm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addbmm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addbmm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcdiv_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcdiv_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcdiv_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcdiv_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcdiv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcdiv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_addcmul_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addcmul_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_decomposed_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_decomposed_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_decomposed_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_decomposed_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_decomposed_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmm_decomposed_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmv_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmv_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmv_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmv_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addmv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_addr_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_all_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_allclose_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_allclose_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_allclose_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_allclose_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_allclose_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_allclose_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_amax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_amax_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_amax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_amax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_amax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_amax_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_amax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_amax_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amax_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_amin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_aminmax_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_angle_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_any_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmax_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmax_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmax_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argmin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argsort_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argsort_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argsort_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argsort_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argsort_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argsort_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argsort_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argsort_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argsort_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_argwhere_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_as_strided_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_asinh_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atan_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atanh_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_1d_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_2d_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_atleast_3d_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_baddbmm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_baddbmm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_baddbmm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_baddbmm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_baddbmm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_baddbmm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bernoulli_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bernoulli_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bernoulli_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bernoulli_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py:3230: UserWarning: Casting complex values to real discards the imaginary part (Triggered internally at ../aten/src/ATen/native/Copy.cpp:239.)
ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bfloat16_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bincount_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_bincount_cuda_int16! Caching allocator allocated memory was 1931776 and is now reported as 1932800 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
warnings.warn(msg)
ok
test_nvfuser_correctness_bincount_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_bincount_cuda_int32! Caching allocator allocated memory was 1932800 and is now reported as 1933824 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
warnings.warn(msg)
ok
test_nvfuser_correctness_bincount_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_bincount_cuda_int64! Caching allocator allocated memory was 1933824 and is now reported as 1934848 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
warnings.warn(msg)
ok
test_nvfuser_correctness_bincount_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_bincount_cuda_int8! Caching allocator allocated memory was 1934848 and is now reported as 1935872 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
warnings.warn(msg)
ok
test_nvfuser_correctness_bincount_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_bincount_cuda_uint8! Caching allocator allocated memory was 1935872 and is now reported as 1936896 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
warnings.warn(msg)
ok
test_nvfuser_correctness_bitwise_and_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_and_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_and_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_and_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_and_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_and_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_left_shift_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_left_shift_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_left_shift_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_left_shift_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_left_shift_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_left_shift_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_left_shift_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_left_shift_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_left_shift_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_not_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bitwise_not_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_not_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bitwise_not_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bitwise_not_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_not_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_or_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_or_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_or_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_or_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_or_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_or_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_right_shift_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_right_shift_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_right_shift_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_right_shift_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_right_shift_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_right_shift_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_right_shift_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_right_shift_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_right_shift_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_xor_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_xor_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_xor_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_xor_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_xor_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bitwise_xor_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_block_diag_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bmm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bmm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bmm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bmm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bmm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bmm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_bool_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bool_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_tensors_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_broadcast_to_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bucketize_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bucketize_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bucketize_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bucketize_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bucketize_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bucketize_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bucketize_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_bucketize_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_byte_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cartesian_prod_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cat_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cdist_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cdist_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ceil_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ceil_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ceil_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ceil_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_char_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cholesky_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_inverse_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_inverse_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_inverse_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_inverse_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_solve_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_solve_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_solve_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_cholesky_solve_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_chunk_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_chunk_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_scalar_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_clamp_scalar_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_clamp_scalar_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_clamp_scalar_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_clamp_scalar_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_scalar_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_clamp_scalar_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_clamp_scalar_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clamp_scalar_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_clone_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_column_stack_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_column_stack_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_column_stack_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_column_stack_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_column_stack_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_column_stack_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py:3230: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:2246.)
ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
ok
test_nvfuser_correctness_combinations_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_combinations_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_complex_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_complex_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_conj_physical_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_contiguous_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_copysign_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_corrcoef_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cos_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cos_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cos_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cos_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cos_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cos_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cos_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cos_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cos_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cos_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cos_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cos_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cosh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cosh_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cosh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cosh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cosh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cosh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cosh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cosh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cosh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cosh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_cosh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cosh_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_count_nonzero_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cov_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_bfloat16! Caching allocator allocated memory was 1936896 and is now reported as 1940992 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_complex128! Caching allocator allocated memory was 1940992 and is now reported as 1945088 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_complex64! Caching allocator allocated memory was 1945088 and is now reported as 1949184 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_float16! Caching allocator allocated memory was 1949184 and is now reported as 1953280 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_float32! Caching allocator allocated memory was 1953280 and is now reported as 1957376 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_float64! Caching allocator allocated memory was 1957376 and is now reported as 1961472 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_int16! Caching allocator allocated memory was 1961472 and is now reported as 1965568 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_int32! Caching allocator allocated memory was 1965568 and is now reported as 1969664 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_int64! Caching allocator allocated memory was 1969664 and is now reported as 1973760 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_int8! Caching allocator allocated memory was 1973760 and is now reported as 1977856 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
warnings.warn(msg)
ok
test_nvfuser_correctness_cov_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_cov_cuda_uint8! Caching allocator allocated memory was 1977856 and is now reported as 1981952 on device 0. CUDA driver allocated memory was 1723858944 and is now 1723858944.
warnings.warn(msg)
ok
test_nvfuser_correctness_cross_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cross_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cross_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cross_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cross_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cross_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cross_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cross_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cross_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cross_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummax_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cummin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumprod_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumsum_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_cumulative_trapezoid_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_deg2rad_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diag_embed_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagflat_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diagonal_scatter_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_diff_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_digamma_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py:3230: UserWarning: Specified kernel cache directory could not be created! This disables kernel caching. Specified directory is /data/home/dberard/.cache/torch/kernels. This warning will appear only once per process. (Triggered internally at ../aten/src/ATen/native/cuda/jit_utils.cpp:844.)
ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
ok
test_nvfuser_correctness_digamma_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_digamma_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_digamma_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_digamma_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_digamma_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_digamma_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_digamma_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_digamma_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dist_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dist_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dist_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dist_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dist_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dist_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_floor_rounding_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_floor_rounding_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_floor_rounding_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_floor_rounding_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_floor_rounding_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_floor_rounding_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_floor_rounding_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_floor_rounding_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_floor_rounding_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_floor_rounding_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_floor_rounding_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_floor_rounding_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_no_rounding_mode_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_no_rounding_mode_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_no_rounding_mode_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_no_rounding_mode_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_no_rounding_mode_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_no_rounding_mode_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_no_rounding_mode_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_no_rounding_mode_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_no_rounding_mode_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_no_rounding_mode_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_no_rounding_mode_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_no_rounding_mode_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_trunc_rounding_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_trunc_rounding_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_trunc_rounding_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_trunc_rounding_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_trunc_rounding_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_trunc_rounding_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_trunc_rounding_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_trunc_rounding_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_trunc_rounding_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_trunc_rounding_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_div_trunc_rounding_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_div_trunc_rounding_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dot_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dot_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dot_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dot_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dot_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dot_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_double_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_double_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_double_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dsplit_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_dstack_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_eig_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_eig_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_eig_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_eig_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_einsum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_einsum_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_einsum_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_einsum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_einsum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_einsum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_empty_like_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_empty_like_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_eq_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_eq_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_eq_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_eq_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_eq_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_eq_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_eq_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_eq_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_eq_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_eq_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_eq_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_eq_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erf_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfc_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_erfc_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_erfc_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_erfc_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_erfc_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_erfc_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfc_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_erfc_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_erfc_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfc_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfinv_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfinv_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfinv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfinv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfinv_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfinv_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfinv_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfinv_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_erfinv_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_exp_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_exp_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_exp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_exp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_exp_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_exp_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_exp_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_exp_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_as_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expand_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_expm1_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fft_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_fftshift_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfft_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_hfftn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifft_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ifftshift_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfft_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfftn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfftn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfftn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfftn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfftn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfftn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfftn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_ihfftn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfft_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_irfftn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfft_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfftn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfftn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfftn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfftn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfftn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfftn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfftn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fft_rfftn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fill__cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flatten_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flip_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fliplr_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_flipud_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_float_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_float_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_float_power_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_divide_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py:3230: UserWarning: floor_divide is deprecated, and will be removed in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values.
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at ../aten/src/ATen/native/BinaryOps.cpp:607.)
ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
ok
test_nvfuser_correctness_floor_divide_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_divide_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_divide_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_divide_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_divide_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_divide_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_divide_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_floor_divide_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmax_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_autodiffed_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_autodiffed_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_autodiffed_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_autodiffed_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_autodiffed_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_autodiffed_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_autodiffed_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_autodiffed_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_autodiffed_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_fmod_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_frac_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_frac_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_frac_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_frac_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_frexp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_frexp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_frexp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_full_like_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gather_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gcd_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gcd_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gcd_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gcd_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gcd_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ge_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ge_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ge_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ge_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ge_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ge_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ge_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ge_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ge_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ge_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_geqrf_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_geqrf_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_geqrf_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_geqrf_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_gradient_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gradient_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gradient_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gradient_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gradient_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gradient_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gradient_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gradient_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gradient_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gradient_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_gt_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_gt_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_gt_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_gt_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_gt_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_gt_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gt_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_gt_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_gt_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_gt_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_half_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_heaviside_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_histc_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_histc_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_histc_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_histc_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_histc_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_histc_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hsplit_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hstack_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hypot_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hypot_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hypot_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_hypot_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_i0_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_igamma_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_igamma_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_igamma_grad_other_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_igamma_grad_other_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_igammac_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_igammac_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_igammac_grad_other_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_igammac_grad_other_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_imag_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_imag_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_add_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_copy_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_fill_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_put_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_index_select_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inner_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inner_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inner_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inner_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inner_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inner_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_int_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_int_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_int_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_int_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_int_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inverse_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inverse_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inverse_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_inverse_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isclose_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isfinite_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isinf_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isnan_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isneginf_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isposinf_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_isreal_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_istft_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_istft_cuda_complex128! Caching allocator allocated memory was 1981952 and is now reported as 1982464 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168.
warnings.warn(msg)
ok
test_nvfuser_correctness_istft_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_istft_cuda_complex64! Caching allocator allocated memory was 1982464 and is now reported as 1982976 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168.
warnings.warn(msg)
ok
test_nvfuser_correctness_istft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/functional.py:770: UserWarning: istft will require a complex-valued input tensor in a future PyTorch release. Matching the output from stft with return_complex=True. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:950.)
return _VF.istft(input, n_fft, hop_length, win_length, window, center, # type: ignore[attr-defined]
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_istft_cuda_float32! Caching allocator allocated memory was 1982976 and is now reported as 1983488 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168.
warnings.warn(msg)
ok
test_nvfuser_correctness_istft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_istft_cuda_float64! Caching allocator allocated memory was 1983488 and is now reported as 1984000 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168.
warnings.warn(msg)
ok
test_nvfuser_correctness_kron_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kron_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kthvalue_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kthvalue_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kthvalue_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kthvalue_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kthvalue_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kthvalue_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kthvalue_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_kthvalue_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lcm_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lcm_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lcm_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lcm_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lcm_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ldexp_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_le_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_le_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_le_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_le_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_le_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_le_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_le_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_le_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_le_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_le_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lerp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lerp_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lerp_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lerp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lerp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lerp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lgamma_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lgamma_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lgamma_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lgamma_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lgamma_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lgamma_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lgamma_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lgamma_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lgamma_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cholesky_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cholesky_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cholesky_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cholesky_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cholesky_ex_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cholesky_ex_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cholesky_ex_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cholesky_ex_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cond_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cond_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cond_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cond_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_cross_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_det_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_det_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_det_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_det_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_det_singular_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_det_singular_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eig_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eig_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eig_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eig_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigvals_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigvals_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigvals_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigvals_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigvalsh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigvalsh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigvalsh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_eigvalsh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_householder_product_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_householder_product_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_householder_product_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_householder_product_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_inv_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_inv_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_inv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_inv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_inv_ex_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_inv_ex_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_inv_ex_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_inv_ex_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_lstsq_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_lstsq_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_lstsq_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_lstsq_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_lstsq_grad_oriented_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_lstsq_grad_oriented_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_lstsq_grad_oriented_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_lstsq_grad_oriented_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_lu_factor_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_lu_factor_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_lu_factor_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_lu_factor_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_lu_factor_ex_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_lu_factor_ex_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_lu_factor_ex_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_lu_factor_ex_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_norm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_norm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_power_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_power_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_power_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_power_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_rank_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_rank_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_rank_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_rank_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_rank_hermitian_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_rank_hermitian_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_rank_hermitian_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_matrix_rank_hermitian_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_multi_dot_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_multi_dot_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_multi_dot_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_multi_dot_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_multi_dot_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_multi_dot_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_norm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_norm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_pinv_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_linalg_pinv_cuda_complex128! Caching allocator allocated memory was 1984000 and is now reported as 1988096 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168.
warnings.warn(msg)
ok
test_nvfuser_correctness_linalg_pinv_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_linalg_pinv_cuda_complex64! Caching allocator allocated memory was 1988096 and is now reported as 1992192 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168.
warnings.warn(msg)
ok
test_nvfuser_correctness_linalg_pinv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_linalg_pinv_cuda_float32! Caching allocator allocated memory was 1992192 and is now reported as 1996288 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168.
warnings.warn(msg)
ok
test_nvfuser_correctness_linalg_pinv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_linalg_pinv_cuda_float64! Caching allocator allocated memory was 1996288 and is now reported as 2000384 on device 0. CUDA driver allocated memory was 2168455168 and is now 2168455168.
warnings.warn(msg)
ok
test_nvfuser_correctness_linalg_pinv_hermitian_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_pinv_hermitian_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_pinv_hermitian_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_pinv_hermitian_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_pinv_singular_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test'
test_nvfuser_correctness_linalg_pinv_singular_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test'
test_nvfuser_correctness_linalg_pinv_singular_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test'
test_nvfuser_correctness_linalg_pinv_singular_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test'
test_nvfuser_correctness_linalg_qr_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_qr_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_qr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_qr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_slogdet_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_slogdet_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_slogdet_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_slogdet_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_solve_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_solve_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_solve_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_solve_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_solve_triangular_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_solve_triangular_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_solve_triangular_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_solve_triangular_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_svd_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_svd_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_svd_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_svd_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_svdvals_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_svdvals_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_svdvals_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_svdvals_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_tensorinv_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_tensorinv_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_tensorinv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_tensorinv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_linalg_tensorsolve_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_linalg_tensorsolve_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_linalg_tensorsolve_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_linalg_tensorsolve_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_linalg_vector_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_vector_norm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_vector_norm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_vector_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_vector_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_linalg_vector_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_log10_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log10_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log10_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log10_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log10_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log10_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log10_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log10_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log10_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log10_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log10_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log10_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log1p_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log2_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log2_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log2_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log2_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_log_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_log_softmax_dtype_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logaddexp2_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logaddexp2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logaddexp2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logaddexp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logaddexp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logaddexp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logcumsumexp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logcumsumexp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logcumsumexp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logcumsumexp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logdet_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_logdet_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_logical_and_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_and_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_not_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_or_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logical_xor_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logit_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_logsumexp_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_long_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_long_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_long_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lt_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lt_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lt_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lt_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lt_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lt_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lt_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lt_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_lt_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lt_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lu_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lu_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lu_solve_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_lu_solve_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_lu_solve_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_lu_solve_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_lu_unpack_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lu_unpack_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lu_unpack_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_lu_unpack_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mH_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mT_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_fill_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_scatter_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_masked_select_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matmul_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matmul_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matmul_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matmul_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matmul_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matmul_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matrix_exp_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matrix_exp_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matrix_exp_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matrix_exp_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matrix_exp_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_matrix_exp_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_binary_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_binary_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_max_binary_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_binary_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_binary_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_binary_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_binary_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_binary_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_binary_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_binary_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_no_dim_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_max_reduction_with_dim_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_maximum_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mean_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mean_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_median_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_median_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_median_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_median_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_median_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_median_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_median_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_median_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_list_of_tensors_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_meshgrid_variadic_tensors_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_binary_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_binary_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_min_binary_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_binary_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_binary_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_binary_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_binary_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_binary_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_binary_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_binary_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_no_dim_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_min_reduction_with_dim_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_minimum_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mode_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_movedim_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_msort_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_msort_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_msort_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_msort_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_msort_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_msort_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_msort_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_msort_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_msort_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mul_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mul_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mul_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mul_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mul_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mul_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mul_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mul_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mul_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mul_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_mul_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mul_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_multinomial_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_multinomial_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_multinomial_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mv_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mv_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mv_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mv_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mv_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mv_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_1_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_3_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_mvlgamma_mvlgamma_p_5_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nan_to_num_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmedian_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmedian_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmedian_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmedian_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmedian_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmedian_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmedian_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanmedian_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanquantile_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nanquantile_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nansum_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_narrow_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ne_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ne_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ne_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ne_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ne_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ne_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ne_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ne_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ne_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ne_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_ne_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ne_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_neg_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_empty_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_empty_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Skipped!'
test_nvfuser_correctness_new_full_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_full_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_ones_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_new_zeros_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nextafter_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nextafter_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nextafter_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool3d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_avg_pool3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool3d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_adaptive_max_pool3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool3d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_avg_pool3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_batch_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:2363: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
if size_prods == 1:
ERROR
test_nvfuser_correctness_nn_functional_batch_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_batch_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_batch_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_batch_norm_without_cudnn_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_batch_norm_without_cudnn_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_batch_norm_without_cudnn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_batch_norm_without_cudnn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_bilinear_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_bilinear_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_bilinear_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_bilinear_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_celu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_celu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_celu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_celu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py:3230: UserWarning: Using padding='same' with even kernel lengths and odd dilation may require a zero-padded copy of the input be created (Triggered internally at ../aten/src/ATen/native/Convolution.cpp:744.)
ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
ok
test_nvfuser_correctness_nn_functional_conv2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose3d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... FAIL
test_nvfuser_correctness_nn_functional_conv_transpose3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_conv_transpose3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_embedding_loss_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_similarity_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_similarity_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_similarity_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cosine_similarity_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_cross_entropy_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_cross_entropy_cuda_bfloat16! Caching allocator allocated memory was 2000384 and is now reported as 2001920 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560.
warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_cross_entropy_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_cross_entropy_cuda_float16! Caching allocator allocated memory was 2001920 and is now reported as 2003456 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560.
warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_cross_entropy_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_cross_entropy_cuda_float32! Caching allocator allocated memory was 2003456 and is now reported as 2004992 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560.
warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_cross_entropy_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_cross_entropy_cuda_float64! Caching allocator allocated memory was 2004992 and is now reported as 2006528 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560.
warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_ctc_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_ctc_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_dropout2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_dropout2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_dropout2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_dropout2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_dropout_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_dropout_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_dropout_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_dropout_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_elu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_elu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_elu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_elu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_embedding_bag_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_embedding_bag_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_embedding_bag_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_embedding_bag_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_embedding_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_embedding_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_embedding_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_embedding_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_with_train_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_with_train_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_with_train_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_with_train_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_feature_alpha_dropout_without_train_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_fractional_max_pool2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_fractional_max_pool2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_fractional_max_pool2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_fractional_max_pool3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_fractional_max_pool3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_fractional_max_pool3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:2756: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
if var.size() != input.size():
/fsx/users/dberard/pytorch/torch/nn/functional.py:2780: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
if torch.any(var < 0):
ERROR
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_gaussian_nll_loss_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_gelu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_gelu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_gelu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_gelu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_glu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_glu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_glu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_glu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_grid_sample_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_grid_sample_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_grid_sample_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_group_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:2475: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
_verify_batch_size([input.size(0) * input.size(1) // num_groups, num_groups] + list(input.size()[2:]))
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_group_norm_cuda_bfloat16! Caching allocator allocated memory was 2006528 and is now reported as 2010624 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560.
warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_group_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_group_norm_cuda_float16! Caching allocator allocated memory was 2010624 and is now reported as 2014720 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560.
warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_group_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_group_norm_cuda_float32! Caching allocator allocated memory was 2014720 and is now reported as 2018816 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560.
warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_group_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_group_norm_cuda_float64! Caching allocator allocated memory was 2018816 and is now reported as 2022912 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560.
warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_hardshrink_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardshrink_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardshrink_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardshrink_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardsigmoid_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardsigmoid_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardsigmoid_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardsigmoid_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardswish_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardswish_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardswish_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardswish_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardtanh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardtanh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardtanh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardtanh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardtanh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardtanh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardtanh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hardtanh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hinge_embedding_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hinge_embedding_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hinge_embedding_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_hinge_embedding_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_huber_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3170: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
if not (target.size() == input.size()):
ok
test_nvfuser_correctness_nn_functional_huber_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_huber_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_huber_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_instance_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:2408: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
if size_prods == 1:
ERROR
test_nvfuser_correctness_nn_functional_instance_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_instance_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_instance_norm_cuda_float32! Caching allocator allocated memory was 2027008 and is now reported as 2041856 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560.
warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_instance_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_instance_norm_cuda_float64! Caching allocator allocated memory was 2041856 and is now reported as 2056704 on device 0. CUDA driver allocated memory was 2684354560 and is now 2684354560.
warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_interpolate_area_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3826: UserWarning: The default behavior for interpolate/upsample with float scale_factor changed in 1.6.0 to align with other frameworks/libraries, and now uses scale_factor directly, instead of relying on the computed output size. If you wish to restore the old behavior, please set recompute_scale_factor=True. See the documentation of nn.Upsample for details.
warnings.warn(
/fsx/users/dberard/pytorch/torch/nn/functional.py:3848: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
(torch.floor((input.size(i + 2).float() * torch.tensor(scale_factors[i], dtype=torch.float32)).float()))
ok
test_nvfuser_correctness_nn_functional_interpolate_area_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_area_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_area_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_bicubic_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3771: UserWarning: Default upsampling behavior when mode=bicubic is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details.
warnings.warn(
ok
test_nvfuser_correctness_nn_functional_interpolate_bicubic_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_bicubic_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_bilinear_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3771: UserWarning: Default upsampling behavior when mode=bilinear is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details.
warnings.warn(
ok
test_nvfuser_correctness_nn_functional_interpolate_bilinear_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_bilinear_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_linear_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3771: UserWarning: Default upsampling behavior when mode=linear is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details.
warnings.warn(
ok
test_nvfuser_correctness_nn_functional_interpolate_linear_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_linear_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_nearest_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_nearest_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_nearest_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_nearest_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_trilinear_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3771: UserWarning: Default upsampling behavior when mode=trilinear is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details.
warnings.warn(
ok
test_nvfuser_correctness_nn_functional_interpolate_trilinear_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_interpolate_trilinear_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_kl_div_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:2863: UserWarning: reduction: 'mean' divides the total loss by both the batch size and the support size.'batchmean' divides only by the batch size, and aligns with the KL div math definition.'mean' will be changed to behave the same as 'batchmean' in the next major release.
warnings.warn(
ERROR
test_nvfuser_correctness_nn_functional_kl_div_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_kl_div_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_kl_div_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_kl_div_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_kl_div_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_kl_div_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_kl_div_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_layer_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_layer_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_layer_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_layer_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_leaky_relu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_leaky_relu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_leaky_relu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_leaky_relu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_linear_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_linear_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_linear_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_linear_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_linear_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_linear_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_local_response_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_local_response_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_local_response_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_local_response_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_logsigmoid_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_logsigmoid_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_logsigmoid_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_max_pool1d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:682: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool1d in a future release.
warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change"
/fsx/users/dberard/pytorch/torch/nn/functional.py:651: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool1d in a future release.
warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change"
ok
test_nvfuser_correctness_nn_functional_max_pool1d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_max_pool1d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_max_pool1d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_max_pool2d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:780: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool2d in a future release.
warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change"
/fsx/users/dberard/pytorch/torch/nn/functional.py:749: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool2d in a future release.
warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change"
ok
test_nvfuser_correctness_nn_functional_max_pool2d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_max_pool2d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_max_pool2d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_max_pool3d_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:878: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool3d in a future release.
warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change"
/fsx/users/dberard/pytorch/torch/nn/functional.py:847: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool3d in a future release.
warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change"
ok
test_nvfuser_correctness_nn_functional_max_pool3d_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_max_pool3d_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_max_pool3d_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_mish_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_mish_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_mish_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_mish_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_mse_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3228: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
if not (target.size() == input.size()):
ok
test_nvfuser_correctness_nn_functional_mse_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_mse_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_mse_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_nll_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_nll_loss_cuda_bfloat16! Caching allocator allocated memory was 2056704 and is now reported as 2070528 on device 0. CUDA driver allocated memory was 2686451712 and is now 2686451712.
warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_nll_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_nll_loss_cuda_float16! Caching allocator allocated memory was 2070528 and is now reported as 2084352 on device 0. CUDA driver allocated memory was 2686451712 and is now 2686451712.
warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_nll_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_nll_loss_cuda_float32! Caching allocator allocated memory was 2084352 and is now reported as 2098176 on device 0. CUDA driver allocated memory was 2686451712 and is now 2686451712.
warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_nll_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_nll_loss_cuda_float64! Caching allocator allocated memory was 2098176 and is now reported as 2112000 on device 0. CUDA driver allocated memory was 2686451712 and is now 2686451712.
warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_normalize_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_normalize_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_normalize_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_normalize_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_normalize_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_normalize_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_one_hot_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:4746: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert padding[-(idx * 2 + 1)] <= size, "Padding value causes wrapping around more than once."
/fsx/users/dberard/pytorch/torch/nn/functional.py:4747: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert padding[-(idx * 2 + 2)] <= size, "Padding value causes wrapping around more than once."
/fsx/users/dberard/pytorch/torch/nn/functional.py:4749: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert (
ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_circular_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_constant_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_reflect_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_reflect_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_reflect_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_reflect_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_reflect_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_replicate_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_replicate_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_replicate_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_replicate_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pad_replicate_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pairwise_distance_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_shuffle_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_pixel_unshuffle_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_poisson_nll_loss_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_prelu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_prelu_cuda_float16! Caching allocator allocated memory was 2112000 and is now reported as 2121216 on device 0. CUDA driver allocated memory was 2686451712 and is now 2686451712.
warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_prelu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_prelu_cuda_float32! Caching allocator allocated memory was 2121216 and is now reported as 2130432 on device 0. CUDA driver allocated memory was 2686451712 and is now 2686451712.
warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_prelu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_nn_functional_prelu_cuda_float64! Caching allocator allocated memory was 2130432 and is now reported as 2139648 on device 0. CUDA driver allocated memory was 2686451712 and is now 2686451712.
warnings.warn(msg)
ok
test_nvfuser_correctness_nn_functional_relu6_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu6_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu6_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu6_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu6_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu6_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu6_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu6_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu6_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_relu_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_rrelu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_rrelu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_rrelu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_rrelu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_selu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_selu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_selu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_selu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_silu_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_silu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_silu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_silu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softmin_with_dtype_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softplus_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softplus_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softplus_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softplus_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softshrink_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softshrink_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softshrink_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softshrink_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softsign_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softsign_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softsign_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softsign_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softsign_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softsign_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softsign_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softsign_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softsign_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softsign_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_softsign_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_softsign_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_tanhshrink_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_threshold_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_threshold_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_threshold_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_threshold_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_threshold_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_threshold_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_threshold_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_nn_functional_threshold_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_threshold_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_unfold_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_unfold_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_unfold_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_unfold_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_unfold_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_upsample_bilinear_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:4008: UserWarning: nn.functional.upsample_bilinear is deprecated. Use nn.functional.interpolate instead.
warnings.warn("nn.functional.upsample_bilinear is deprecated. Use nn.functional.interpolate instead.")
ok
test_nvfuser_correctness_nn_functional_upsample_bilinear_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_upsample_bilinear_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_upsample_nearest_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/nn/functional.py:3953: UserWarning: nn.functional.upsample_nearest is deprecated. Use nn.functional.interpolate instead.
warnings.warn("nn.functional.upsample_nearest is deprecated. Use nn.functional.interpolate instead.")
ok
test_nvfuser_correctness_nn_functional_upsample_nearest_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_upsample_nearest_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nn_functional_upsample_nearest_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_nonzero_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_fro_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_fro_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_fro_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_fro_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_fro_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_fro_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_inf_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_inf_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_inf_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_inf_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_inf_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_inf_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_nuc_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_nuc_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_nuc_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_norm_nuc_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_normal_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_normal_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_normal_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_normal_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_normal_number_mean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_normal_number_mean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_normal_number_mean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_normal_number_mean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ones_like_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ormqr_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ormqr_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ormqr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ormqr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_outer_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pca_lowrank_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pca_lowrank_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_permute_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pinverse_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pinverse_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pinverse_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pinverse_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polar_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polar_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_0_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_1_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_2_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_3_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_polygamma_polygamma_n_4_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_positive_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_pow_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_pow_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_pow_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_prod_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_put_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_qr_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_qr_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_qr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_qr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_quantile_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_quantile_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rad2deg_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rand_like_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rand_like_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rand_like_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rand_like_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rand_like_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rand_like_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randint_like_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randint_like_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randint_like_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randint_like_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randint_like_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randint_like_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randint_like_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randint_like_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randint_like_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randn_like_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randn_like_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randn_like_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randn_like_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randn_like_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_randn_like_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_ravel_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_real_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reciprocal_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_reciprocal_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_reciprocal_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reciprocal_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reciprocal_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_reciprocal_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_reciprocal_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_reciprocal_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reciprocal_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_reciprocal_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_reciprocal_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reciprocal_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_autodiffed_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_remainder_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_renorm_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_renorm_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_renorm_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_renorm_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_renorm_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_renorm_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_repeat_interleave_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_bfloat16! Caching allocator allocated memory was 2139648 and is now reported as 2140160 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_bool! Caching allocator allocated memory was 2140160 and is now reported as 2140672 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_complex128! Caching allocator allocated memory was 2140672 and is now reported as 2141184 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_complex64! Caching allocator allocated memory was 2141184 and is now reported as 2141696 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_float16! Caching allocator allocated memory was 2141696 and is now reported as 2142208 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_float32! Caching allocator allocated memory was 2142208 and is now reported as 2142720 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_float64! Caching allocator allocated memory was 2142720 and is now reported as 2143232 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_int16! Caching allocator allocated memory was 2143232 and is now reported as 2143744 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_int32! Caching allocator allocated memory was 2143744 and is now reported as 2144256 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_int64! Caching allocator allocated memory was 2144256 and is now reported as 2144768 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_int8! Caching allocator allocated memory was 2144768 and is now reported as 2145280 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_repeat_interleave_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_repeat_interleave_cuda_uint8! Caching allocator allocated memory was 2145280 and is now reported as 2145792 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_reshape_as_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_as_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_reshape_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize__cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resize_as__cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_conj_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_resolve_neg_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_roll_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rot90_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_0_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_0_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_0_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_0_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_3_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_3_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_3_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_3_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_neg_3_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_neg_3_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_neg_3_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_round_decimals_neg_3_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsqrt_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsqrt_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsqrt_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsqrt_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsqrt_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsqrt_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsqrt_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsqrt_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsqrt_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsqrt_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsqrt_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsqrt_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_scalar_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_scalar_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_scalar_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_scalar_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_scalar_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_scalar_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_scalar_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_scalar_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_scalar_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_scalar_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_scalar_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_rsub_rsub_tensor_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_rsub_rsub_tensor_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_add_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_scatter_reduce_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu'
test_nvfuser_correctness_scatter_reduce_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu'
test_nvfuser_correctness_scatter_reduce_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu'
test_nvfuser_correctness_scatter_reduce_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu'
test_nvfuser_correctness_scatter_reduce_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu'
test_nvfuser_correctness_scatter_reduce_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu'
test_nvfuser_correctness_scatter_reduce_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu'
test_nvfuser_correctness_scatter_reduce_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu'
test_nvfuser_correctness_scatter_reduce_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'Only runs on cpu'
test_nvfuser_correctness_searchsorted_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_float16! Caching allocator allocated memory was 2145792 and is now reported as 2223616 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_searchsorted_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_float32! Caching allocator allocated memory was 2223616 and is now reported as 2301440 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_searchsorted_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_float64! Caching allocator allocated memory was 2301440 and is now reported as 2379264 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_searchsorted_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_int16! Caching allocator allocated memory was 2379264 and is now reported as 2457088 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_searchsorted_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_int32! Caching allocator allocated memory was 2457088 and is now reported as 2534912 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_searchsorted_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_int64! Caching allocator allocated memory was 2534912 and is now reported as 2612736 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_searchsorted_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_int8! Caching allocator allocated memory was 2612736 and is now reported as 2690560 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_searchsorted_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_searchsorted_cuda_uint8! Caching allocator allocated memory was 2690560 and is now reported as 2768384 on device 0. CUDA driver allocated memory was 2688548864 and is now 2688548864.
warnings.warn(msg)
ok
test_nvfuser_correctness_select_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_select_scatter_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sgn_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_channels_last_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_short_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sigmoid_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sigmoid_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sigmoid_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sigmoid_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sigmoid_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sigmoid_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sigmoid_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sigmoid_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sigmoid_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sigmoid_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sigmoid_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sigmoid_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sign_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_signbit_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinc_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sinh_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_slice_scatter_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_softmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_with_dtype_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_with_dtype_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_with_dtype_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_softmax_with_dtype_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_softmax_with_dtype_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_with_dtype_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_with_dtype_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_with_dtype_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_softmax_with_dtype_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_with_dtype_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_softmax_with_dtype_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_softmax_with_dtype_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_solve_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_solve_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_solve_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_solve_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_sort_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sort_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sort_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sort_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sort_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sort_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sort_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sort_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sort_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_entr_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_erfcx_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_erfcx_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_erfcx_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_erfcx_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_erfcx_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_erfcx_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_erfcx_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_erfcx_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i0e_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1e_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1e_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1e_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1e_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1e_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1e_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1e_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_i1e_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtr_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtri_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtri_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtri_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtri_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtri_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtri_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtri_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_ndtri_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_polygamma_special_polygamma_n_0_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_xlog1py_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_grad_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_grad_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_grad_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_grad_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_grad_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_grad_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_grad_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_special_zeta_grad_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_list_args_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_split_with_sizes_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sqrt_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_square_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_squeeze_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stack_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_mean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_mean_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_mean_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_mean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_mean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_std_mean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_stft_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_stft_cuda_complex128! Caching allocator allocated memory was 2768384 and is now reported as 2768896 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
warnings.warn(msg)
ok
test_nvfuser_correctness_stft_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_stft_cuda_complex64! Caching allocator allocated memory was 2768896 and is now reported as 2769408 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
warnings.warn(msg)
ok
test_nvfuser_correctness_stft_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/functional.py:695: UserWarning: stft will soon require the return_complex parameter be given for real inputs, and will further require that return_complex=True in a future PyTorch release. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:798.)
return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined]
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_stft_cuda_float32! Caching allocator allocated memory was 2769408 and is now reported as 2769920 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
warnings.warn(msg)
ok
test_nvfuser_correctness_stft_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... /fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:1317: UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness_stft_cuda_float64! Caching allocator allocated memory was 2769920 and is now reported as 2770432 on device 0. CUDA driver allocated memory was 2692743168 and is now 2692743168.
warnings.warn(msg)
ok
test_nvfuser_correctness_sub_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sub_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sub_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sub_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sub_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sub_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sub_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sub_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sub_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sub_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sub_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sum_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sum_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sum_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sum_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sum_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_to_size_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_to_size_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_to_size_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_to_size_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_sum_to_size_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_sum_to_size_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_svd_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_svd_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_svd_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_svd_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_svd_lowrank_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_svd_lowrank_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_symeig_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_symeig_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_symeig_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_symeig_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_t_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_t_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_along_dim_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_take_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tan_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tanh_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensor_split_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensordot_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensordot_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensordot_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensordot_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensordot_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tensordot_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tile_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... [W pybind_utils.cpp:39] Warning: Using sparse tensors in TorchScript is experimental. Many optimization pathways have not been thoroughly tested with sparse tensors. Please include the fact that the network is running sparse tensors in any bug reports submitted. (function operator())
/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py:424: UserWarning: Using sparse tensors in TorchScript is experimental. Many optimization pathways have not been thoroughly tested with sparse tensors. Please include the fact that the network is running sparse tensors in any bug reports submitted. (Triggered internally at ../torch/csrc/jit/python/pybind_utils.h:691.)
return callable(*args, **kwargs)
ok
test_nvfuser_correctness_to_sparse_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_to_sparse_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_topk_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_topk_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_topk_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_topk_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_topk_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_topk_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_topk_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_topk_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_topk_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trace_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_transpose_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapezoid_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trapz_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triangular_solve_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_triangular_solve_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_triangular_solve_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_triangular_solve_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... skipped 'no MAGMA library detected'
test_nvfuser_correctness_tril_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_tril_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_triu_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_true_divide_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_true_divide_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_true_divide_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_true_divide_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_true_divide_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_true_divide_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_true_divide_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_true_divide_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_true_divide_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_true_divide_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ERROR
test_nvfuser_correctness_true_divide_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_true_divide_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trunc_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trunc_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trunc_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_trunc_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unfold_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_consecutive_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_consecutive_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_consecutive_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_consecutive_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_consecutive_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_consecutive_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_consecutive_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_consecutive_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_consecutive_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unique_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_bool (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_unsqueeze_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_mean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_mean_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_mean_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_mean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_mean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_var_mean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_vdot_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_vdot_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_vdot_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_vdot_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_vdot_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_vdot_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA) ... ok
test_nvfuser_correctness_view_as_complex_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA) ... ../c10/util/TypeCast.h:135: fetch_and_cast: block: [0,0,0], thread: [0,0,0] Assertion `false` failed.
../c10/util/TypeCast.h:135: fetch_and_cast: block: [0,0,0], thread: [1,0,0] Assertion `false` failed.
../c10/util/TypeCast.h:135: fetch_and_cast: block: [0,0,0], thread: [2,0,0] Assertion `false` failed.
../c10/util/TypeCast.h:135: fetch_and_cast: block: [0,0,0], thread: [3,0,0] Assertion `false` failed.
../c10/util/TypeCast.h:135: fetch_and_cast: block: [0,0,0], thread: [4,0,0] Assertion `false` failed.
ERROR
======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_bfloat16! Caching allocator allocated memory was 512 and is now reported as 35328 on device 0. CUDA driver allocated memory was 1369440256 and is now 1371537408.
======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_bool! Caching allocator allocated memory was 35328 and is now reported as 70144 on device 0. CUDA driver allocated memory was 1371537408 and is now 1373634560.
======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_complex128! Caching allocator allocated memory was 70144 and is now reported as 104960 on device 0. CUDA driver allocated memory was 1373634560 and is now 1375731712.
======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_complex64! Caching allocator allocated memory was 104960 and is now reported as 139776 on device 0. CUDA driver allocated memory was 1375731712 and is now 1377828864.
======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_float16! Caching allocator allocated memory was 139776 and is now reported as 174592 on device 0. CUDA driver allocated memory was 1377828864 and is now 1379926016.
======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_float32! Caching allocator allocated memory was 174592 and is now reported as 209408 on device 0. CUDA driver allocated memory was 1379926016 and is now 1382023168.
======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_float64! Caching allocator allocated memory was 209408 and is now reported as 244224 on device 0. CUDA driver allocated memory was 1382023168 and is now 1384120320.
======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_int16! Caching allocator allocated memory was 244224 and is now reported as 279040 on device 0. CUDA driver allocated memory was 1384120320 and is now 1386217472.
======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_int32! Caching allocator allocated memory was 279040 and is now reported as 313856 on device 0. CUDA driver allocated memory was 1386217472 and is now 1388314624.
======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_int64! Caching allocator allocated memory was 313856 and is now reported as 348672 on device 0. CUDA driver allocated memory was 1388314624 and is now 1390411776.
======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_int8! Caching allocator allocated memory was 348672 and is now reported as 383488 on device 0. CUDA driver allocated memory was 1390411776 and is now 1392508928.
======================================================================
ERROR: test_nvfuser_correctness___getitem___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___getitem___cuda_uint8! Caching allocator allocated memory was 383488 and is now reported as 418304 on device 0. CUDA driver allocated memory was 1392508928 and is now 1394606080.
======================================================================
ERROR: test_nvfuser_correctness___radd___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from DataType: int64_t to DataType: int
======================================================================
ERROR: test_nvfuser_correctness___rmul___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from DataType: int64_t to DataType: int
======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_bfloat16! Caching allocator allocated memory was 418304 and is now reported as 422400 on device 0. CUDA driver allocated memory was 1675624448 and is now 1677721600.
======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_bool! Caching allocator allocated memory was 422400 and is now reported as 426496 on device 0. CUDA driver allocated memory was 1677721600 and is now 1679818752.
======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_complex128! Caching allocator allocated memory was 426496 and is now reported as 430592 on device 0. CUDA driver allocated memory was 1679818752 and is now 1681915904.
======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_complex64! Caching allocator allocated memory was 430592 and is now reported as 434688 on device 0. CUDA driver allocated memory was 1681915904 and is now 1684013056.
======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_float16! Caching allocator allocated memory was 434688 and is now reported as 438784 on device 0. CUDA driver allocated memory was 1684013056 and is now 1686110208.
======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_float32! Caching allocator allocated memory was 438784 and is now reported as 442880 on device 0. CUDA driver allocated memory was 1686110208 and is now 1688207360.
======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_float64! Caching allocator allocated memory was 442880 and is now reported as 446976 on device 0. CUDA driver allocated memory was 1688207360 and is now 1690304512.
======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_int16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_int16! Caching allocator allocated memory was 446976 and is now reported as 451072 on device 0. CUDA driver allocated memory was 1690304512 and is now 1692401664.
======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_int32! Caching allocator allocated memory was 451072 and is now reported as 455168 on device 0. CUDA driver allocated memory was 1692401664 and is now 1694498816.
======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_int64! Caching allocator allocated memory was 455168 and is now reported as 459264 on device 0. CUDA driver allocated memory was 1694498816 and is now 1696595968.
======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_int8 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_int8! Caching allocator allocated memory was 459264 and is now reported as 463360 on device 0. CUDA driver allocated memory was 1696595968 and is now 1698693120.
======================================================================
ERROR: test_nvfuser_correctness___rpow___cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness___rpow___cuda_uint8! Caching allocator allocated memory was 463360 and is now reported as 467456 on device 0. CUDA driver allocated memory was 1698693120 and is now 1700790272.
======================================================================
ERROR: test_nvfuser_correctness__masked_amax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_amax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_amax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_amax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_amax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Could not generate a max op for tensor with type: int
======================================================================
ERROR: test_nvfuser_correctness__masked_amax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_amin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: sorted_exprs.size() > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp":1403, please report a bug to PyTorch. Error during expression sorting, no expressions produced.
======================================================================
ERROR: test_nvfuser_correctness__masked_amin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: sorted_exprs.size() > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp":1403, please report a bug to PyTorch. Error during expression sorting, no expressions produced.
======================================================================
ERROR: test_nvfuser_correctness__masked_amin_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: sorted_exprs.size() > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp":1403, please report a bug to PyTorch. Error during expression sorting, no expressions produced.
======================================================================
ERROR: test_nvfuser_correctness__masked_amin_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: sorted_exprs.size() > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp":1403, please report a bug to PyTorch. Error during expression sorting, no expressions produced.
======================================================================
ERROR: test_nvfuser_correctness__masked_amin_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amin_cuda_int8! Caching allocator allocated memory was 636416 and is now reported as 664576 on device 0. CUDA driver allocated memory was 1700790272 and is now 1702887424.
======================================================================
ERROR: test_nvfuser_correctness__masked_amin_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_amin_cuda_uint8! Caching allocator allocated memory was 664576 and is now reported as 692736 on device 0. CUDA driver allocated memory was 1702887424 and is now 1704984576.
======================================================================
ERROR: test_nvfuser_correctness__masked_log_softmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_log_softmax_cuda_bfloat16! Caching allocator allocated memory was 692736 and is now reported as 703488 on device 0. CUDA driver allocated memory was 1704984576 and is now 1707081728.
======================================================================
ERROR: test_nvfuser_correctness__masked_log_softmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_log_softmax_cuda_float16! Caching allocator allocated memory was 703488 and is now reported as 714240 on device 0. CUDA driver allocated memory was 1707081728 and is now 1709178880.
======================================================================
ERROR: test_nvfuser_correctness__masked_log_softmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_log_softmax_cuda_float32! Caching allocator allocated memory was 714240 and is now reported as 724992 on device 0. CUDA driver allocated memory was 1709178880 and is now 1711276032.
======================================================================
ERROR: test_nvfuser_correctness__masked_log_softmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_log_softmax_cuda_float64! Caching allocator allocated memory was 724992 and is now reported as 735744 on device 0. CUDA driver allocated memory was 1711276032 and is now 1713373184.
======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: falseINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/executor_utils.cpp":828, please report a bug to PyTorch. namespace CudaCodeGen {
typedef unsigned char uint8_t;
typedef signed char int8_t;
typedef short int int16_t;
typedef int int32_t;
typedef unsigned int uint32_t;
typedef long long int int64_t;
typedef unsigned long long int uint64_t;
typedef int nvfuser_index_t;
#define __NVFUSER_HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_HALF_TO_CUS(var) \
*(reinterpret_cast<const unsigned short*>(&(var)))
struct __half;
__device__ __half __float2half(const float);
struct __align__(2) __half {
__half() = default;
__device__ __half(const float f) {
__x = __float2half(f).__x;
}
protected:
unsigned short __x;
};
__device__ __half __float2half(const float f) {
__half val;
asm("{ cvt.rn.f16.f32 %0, %1;}\n"
: "=h"(__NVFUSER_HALF_TO_US(val))
: "f"(f));
return val;
}
__device__ float __half2float(const __half h) {
float val;
asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__NVFUSER_HALF_TO_CUS(h)));
return val;
}
// aligned vector generates vectorized load/store on CUDA
template <typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) Array {
scalar_t val[vec_size];
__device__ void set(scalar_t v) {
for (int i = 0; i < vec_size; ++i) {
val[i] = v;
}
}
};
#define __NVFUSER_BFLOAT_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_BFLOAT_TO_CUS(var) \
*(reinterpret_cast<const unsigned short*>(&(var)))
struct __bfloat;
__device__ __bfloat __float2bfloat(const float);
struct __align__(2) __bfloat {
__bfloat() = default;
__device__ __bfloat(const float f) {
__x = __float2bfloat(f).__x;
}
protected:
unsigned short __x;
};
__device__ __bfloat __float2bfloat(const float f) {
__bfloat val;
asm("{ cvt.rn.bf16.f32 %0, %1;}\n"
: "=h"(__NVFUSER_BFLOAT_TO_US(val))
: "f"(f));
return val;
}
__device__ float __bfloat2float(const __bfloat h) {
float val;
asm("{ mov.b32 %0, {0,%1};}\n"
: "=f"(val)
: "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
return val;
}
template <typename T, int N>
struct Tensor {
__device__ T& operator[](nvfuser_index_t ind) {
return data[ind];
};
T* data;
nvfuser_index_t size[N];
nvfuser_index_t stride[N];
};
// Specialization for 0-dim case as it does not need size and stride arrays.
// They will be an error as well since zero-length arrays are not allowed.
template <typename T>
struct Tensor<T, 0> {
__device__ T& operator[](nvfuser_index_t) {
return *data;
};
T* data;
};
class Philox {
public:
__device__ Philox(
unsigned long long seed,
unsigned long long subsequence,
unsigned long long offset) {
key.x = (unsigned int)seed;
key.y = (unsigned int)(seed >> 32);
counter = make_uint4(0, 0, 0, 0);
counter.z = (unsigned int)(subsequence);
counter.w = (unsigned int)(subsequence >> 32);
STATE = 0;
incr_n(offset / 4);
}
__device__ unsigned long operator()() {
if (STATE == 0) {
uint4 counter_ = counter;
uint2 key_ = key;
for (int i = 0; i < 9; i++) {
counter_ = single_round(counter_, key_);
key_.x += (kPhilox10A);
key_.y += (kPhilox10B);
}
output = single_round(counter_, key_);
incr();
}
unsigned long ret = 0;
switch (STATE) {
case 0:
ret = output.x;
break;
case 1:
ret = output.y;
break;
case 2:
ret = output.z;
break;
case 3:
ret = output.w;
break;
}
STATE = (STATE + 1) % 4;
return ret;
}
private:
__device__ void incr_n(unsigned long long n) {
unsigned int nlo = (unsigned int)(n);
unsigned int nhi = (unsigned int)(n >> 32);
counter.x += nlo;
if (counter.x < nlo)
nhi++;
counter.y += nhi;
if (nhi <= counter.y)
return;
if (++counter.z)
return;
++counter.w;
}
__device__ void incr() {
if (++counter.x)
return;
if (++counter.y)
return;
if (++counter.z)
return;
++counter.w;
}
__device__ unsigned int mulhilo32(
unsigned int a,
unsigned int b,
unsigned int* result_high) {
*result_high = __umulhi(a, b);
return a * b;
}
__device__ uint4 single_round(uint4 ctr, uint2 key) {
unsigned int hi0;
unsigned int hi1;
unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
return ret;
}
private:
static constexpr unsigned long kPhilox10A = 0x9E3779B9;
static constexpr unsigned long kPhilox10B = 0xBB67AE85;
static constexpr unsigned long kPhiloxSA = 0xD2511F53;
static constexpr unsigned long kPhiloxSB = 0xCD9E8D57;
uint4 counter = {};
uint4 output = {};
uint2 key = {};
unsigned int STATE = 0;
};
__device__ float uniformf(unsigned int x) {
constexpr float kRanInvM32 = 2.3283064e-10f; // Inverse of 2^32.
return x * kRanInvM32;
}
__device__ double uniform(unsigned int x, unsigned int y) {
constexpr double kRan2Pow53Inv = 1.1102230246251565e-16;
const unsigned long long z =
(unsigned long long)x ^ ((unsigned long long)y << (53 - 32));
return z * kRan2Pow53Inv + (kRan2Pow53Inv / 2.0);
}
#define NVFUSER_DEFINE_MAGIC_ZERO \
__shared__ int nvfuser_zero_s; \
if (threadIdx.x == 0) \
nvfuser_zero_s = 0; \
__syncthreads(); \
atomicMin(&nvfuser_zero_s, threadIdx.x); \
int nvfuser_zero = nvfuser_zero_s;
#define NVFUSER_UPDATE_MAGIC_ZERO \
do { \
nvfuser_zero <<= 1; \
} while (0);
__device__ constexpr int ceilDiv(int a, int b) {
return (a + b - 1) / b;
}
__device__ constexpr int64_t ceilDiv(int64_t a, int64_t b) {
return (a + b - 1) / b;
}
__device__ constexpr int64_t ceilDiv(int64_t a, int b) {
return ceilDiv(a, (int64_t)b);
}
__device__ constexpr int64_t ceilDiv(int a, int64_t b) {
return ceilDiv((int64_t)a, b);
}
__device__ constexpr int max(int a, int b) {
return ::max(a, b);
}
__device__ constexpr int64_t max(int64_t a, int b) {
return ::max(a, (int64_t)b);
}
__device__ constexpr int64_t max(int a, int64_t b) {
return ::max((int64_t)a, b);
}
__device__ constexpr int64_t max(int64_t a, int64_t b) {
return ::max(a, b);
}
__device__ double fmax(double a, double b) {
// check and propagate NaN
if (a != a) {
return a;
} else if (b != b) {
return b;
} else {
return ::fmax(a, b);
}
}
__device__ float fmax(float a, float b) {
// check and propagate NaN
if (a != a) {
return a;
} else if (b != b) {
return b;
} else {
return ::fmax(a, b);
}
}
__device__ constexpr int min(int a, int b) {
return ::min(a, b);
}
__device__ constexpr int64_t min(int64_t a, int b) {
return ::min(a, (int64_t)b);
}
__device__ constexpr int64_t min(int a, int64_t b) {
return ::min((int64_t)a, b);
}
__device__ constexpr int64_t min(int64_t a, int64_t b) {
return ::min(a, b);
}
__device__ double fmin(double a, double b) {
// check and propagate NaN
if (a != a) {
return a;
} else if (b != b) {
return b;
} else {
return ::fmin(a, b);
}
}
__device__ float fmin(float a, float b) {
// check and propagate NaN
if (a != a) {
return a;
} else if (b != b) {
return b;
} else {
return ::fmin(a, b);
}
}
__device__ constexpr int alignBufferSize(int buffer, int size) {
return (buffer + (size - 1)) & ~(size - 1);
}
__device__ double clamp(double x, double minv, double maxv) {
return x < minv ? minv : (x > maxv ? maxv : x);
}
__device__ float clamp(float x, double minv, double maxv) {
return x < minv ? minv : (x > maxv ? maxv : x);
}
__device__ double frac(double x) {
return x - trunc(x);
}
__device__ float frac(float x) {
return x - trunc(x);
}
__device__ double gelu(double x) {
return x * normcdf(x);
}
__device__ float gelu(float x) {
return x * normcdf(x);
}
__device__ double reciprocal(double x) {
return 1 / x;
}
__device__ float reciprocal(float x) {
return 1 / x;
}
__device__ double relu(double x) {
return x <= 0 ? 0 : x;
}
__device__ float relu(float x) {
return x <= 0 ? 0 : x;
}
__device__ float relu(int64_t x) {
return x <= 0 ? 0 : x;
}
__device__ float relu(int x) {
return x <= 0 ? 0 : x;
}
__device__ double remainder(double a, double b) {
auto mod = ::fmod(a, b);
if ((mod != 0) && ((b < 0) != (mod < 0)))
mod += b;
return mod;
}
__device__ float remainder(float a, float b) {
auto mod = ::fmod(a, b);
if ((mod != 0) && ((b < 0) != (mod < 0)))
mod += b;
return mod;
}
__device__ double sigmoid(double x) {
return 1 / (1 + exp(-x));
}
__device__ float sigmoid(float x) {
return 1 / (1 + exp(-x));
}
__device__ double silu(double x) {
return x * sigmoid(x);
}
__device__ float silu(float x) {
return x * sigmoid(x);
}
__device__ double threshold(double x, double t, double v) {
return x <= t ? v : x;
}
__device__ float threshold(float x, double t, double v) {
return x <= t ? v : x;
}
__device__ double where(bool c, double a, double b) {
return c ? a : b;
}
__device__ float where(bool c, float a, float b) {
return c ? a : b;
}
__device__ int64_t where(bool c, int64_t a, int64_t b) {
return c ? a : b;
}
__device__ double randLike(Philox& rnd) {
return uniform(rnd(), rnd());
}
__device__ float randLikef(Philox& rnd) {
return uniformf(rnd());
}
__device__ constexpr int64_t remainder(int64_t a, int64_t b) {
auto mod = a % b;
if ((mod != 0) && ((b < 0) != (mod < 0)))
mod += b;
return mod;
}
__device__ constexpr int remainder(int a, int b) {
auto mod = a % b;
if ((mod != 0) && ((b < 0) != (mod < 0)))
mod += b;
return mod;
}
__device__ constexpr int64_t fmod(int64_t a, int64_t b) {
return a % b;
}
__device__ constexpr int fmod(int a, int b) {
return a % b;
}
__device__ constexpr double fmod(double a, double b) {
return ::fmod(a, b);
}
__device__ constexpr float fmod(float a, float b) {
return ::fmod(a, b);
}
template <typename T>
__device__ T pow(T a, T b) {
if (b < 0) {
if (a == 1) {
return 1;
} else if (a == -1) {
auto negative = (-b) % static_cast<T>(2);
return negative ? -1 : 1;
} else {
return 0;
}
} else {
T result = 1;
while (b) {
if (b & 1) {
result *= a;
}
b /= 2;
a *= a;
}
return result;
}
}
template int pow<int>(int a, int b);
template int64_t pow<int64_t>(int64_t a, int64_t b);
template <>
float pow<float>(float a, float b) {
return ::pow(a, b);
}
template <>
double pow<double>(double a, double b) {
return ::pow(a, b);
}
namespace index_utils {
// Utility functions
// Total size of provided dimension
template <typename _dim3>
__device__ __forceinline__ nvfuser_index_t size(const _dim3& d) {
return (nvfuser_index_t)d.x * (nvfuser_index_t)d.y * (nvfuser_index_t)d.z;
}
// Linearized indexing of idx based on dim, if bool==false that dimension does
// not participate
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t maskedOffset(const _dim3& idx, const _dim3_2& dim) {
nvfuser_index_t offset = 0;
if (Z)
offset += idx.z;
if (Y)
offset = offset * dim.y + idx.y;
if (X)
offset = offset * dim.x + idx.x;
return offset;
}
// Linearized indexing of idx based on dim. All dimensions participate.
template <typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t offset(const _dim3& idx, const _dim3_2& dim) {
nvfuser_index_t offset = idx.z;
offset = offset * dim.y + idx.y;
offset = offset * dim.x + idx.x;
return offset;
}
// Masks the provided dim3, those == false get truncated to 1
template <bool X, bool Y, bool Z, typename _dim3>
__device__ dim3 maskedDims(const _dim3& dim) {
return dim3{
X ? (unsigned)dim.x : 1U,
Y ? (unsigned)dim.y : 1U,
Z ? (unsigned)dim.z : 1U};
}
// Provides total size of dim with masking, those dims == false do not
// participate in the size calculation
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
__device__ nvfuser_index_t maskedSize(const _dim3& dim) {
return size(maskedDims<X_BLOCK, Y_BLOCK, Z_BLOCK>(dim));
}
// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3>
__device__ bool maskedIsZero(const _dim3& idx) {
bool isZero = true;
if (X)
isZero = isZero && idx.x == 0;
if (Y)
isZero = isZero && idx.y == 0;
if (Z)
isZero = isZero && idx.z == 0;
return isZero;
}
// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ bool maskedIsLast(const _dim3& idx, const _dim3_2& dim) {
bool isZero = true;
if (X)
isZero = isZero && idx.x == dim.x - 1;
if (Y)
isZero = isZero && idx.y == dim.y - 1;
if (Z)
isZero = isZero && idx.z == dim.z - 1;
return isZero;
}
} // namespace index_utils
// Default block synchronization. Just use __barrier_sync
namespace block_sync {
__forceinline__ __device__ void init() {}
// Thread-block synchronization
__forceinline__ __device__ void sync() {
__barrier_sync(0);
}
} // namespace block_sync
namespace grid_sync {
// Get the first bit in a 64 bit integer
#define FIRST_UINT64_BIT ((uint64_t)1 << (sizeof(uint64_t) * 8 - 1))
template <typename T>
__device__ T globalAsVolatile(volatile T& global_val) {
return global_val;
}
// A grid synchronization that can be called multiple times in a kernel assuming
// all the blocks fit on device at once. The semaphore is an integer semaphore
// assumed to be initialized to 0 before launching the kernel. The persistent
// option should be envoked if this sync will be called multiple times in one
// kernel (i.e. having a grid reduce within a loop). Having multiple grid syncs
// called once in the same kernel does not require persistent mode. Segment size
// is the number of blocks participating in the sync in the dimensions marked by
// [X,Y,Z]_BLOCK. The granularity of this sync are those dimensions. I.E.
// Marking X and Y but not Z means there should be Z semaphores of size X*Y.
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, bool PERSISTENT>
__device__ void sync(int64_t& semaphore, const uint64_t& segment_size) {
// Finish all global memory transactions before synchronizing
__threadfence();
// Synchronize all threads in a block before synchronizing blocks
block_sync::sync();
// Only allow linear_tid == 0 to participate in the synchronization
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
// Get increment value, only want a single block to have the large
// increment, doesn't really matter which one, the goal is to flip/flop the
// first bit of a uint64_t value, since our semaphores are actualy int64_t
// we will just reinterpret_cast it to act as a uint64_t
uint64_t semaphore_increment = 1;
// Makes the assumption that blocks are in increasing order, this is not
// guaranteed by CUDA but this is the current behavior, and unlikely to
// change.
bool last_block =
index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
if (last_block) {
semaphore_increment = FIRST_UINT64_BIT - (segment_size - 1);
}
uint64_t oldArrive =
atomicAdd(reinterpret_cast<uint64_t*>(&semaphore), semaphore_increment);
// If for persistent kernels, lock all blocks until the semaphore has been
// reached. Make sure we access semaphore as a volatile address so we get
// the global memory updates.
while ((PERSISTENT || last_block) &&
((oldArrive ^ globalAsVolatile(semaphore)) & FIRST_UINT64_BIT) ==
0) {
// Put a sleep here so we have some breaks in probing the global
// semaphore, giving a better chance for other warps/blocks to catch up.
#if __CUDA_ARCH__ >= 700
__nanosleep(200);
#else
// __nanosleep is not available for sm < 70
assert(false);
#endif
}
}
// Sync block to make sure all other threads are waiting on the sync
block_sync::sync();
}
} // namespace grid_sync
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block. If set to false the dimension doesn't
// participate in the reduction. We could start with warp reductions, then
// reduce the warps, this could save some shared memory, but could be slower in
// some instances.
//
// EXAMPLE USAGE:
// blockReduceSum<X_THREADS, Y_THREADS, Z_THREADS>
// (output[output_index], inputs[input_index],
// [] __device__ (T& a, const T b) { a += b; });
//
// Note: We agressively template functions taking dim3 in the functions below
// because ROCM uses different types for the various dim3 and maps them
// directly to intrinsics, but they're dim3 when used after modification.
//
template <
bool X_REDUCE,
bool Y_REDUCE,
bool Z_REDUCE,
typename T,
typename Func,
typename _dim3,
typename _dim3_2>
__device__ void blockReduce(
T& out,
const T& inp_val,
Func reduction_op,
const _dim3& thread_idx,
const _dim3_2& block_dim,
T* shared_mem,
bool read_pred,
bool write_pred,
T init_val) {
// If this thread will output a final result
bool should_write =
index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);
// Size of the reduction segments
unsigned int reduction_size =
index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);
// Index into the reduction segment
unsigned int reduction_tid =
index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
thread_idx, block_dim);
// Index of the reduction segment
unsigned int reduction_idx =
index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
thread_idx, block_dim);
// Offset into smem for the current thread
unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;
// Initialize shared memory
if (read_pred) {
shared_mem[smem_offset] = inp_val;
} else {
shared_mem[smem_offset] = init_val;
}
block_sync::sync();
// Reduce down to nearest power of 2 for the tree reduction:
int np2 = 1 << (31 - __clz(reduction_size));
if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + np2]);
}
block_sync::sync();
// loop peel the final iteration to save one syncthread for the end
for (int factor = np2 / 2; factor > 1; factor >>= 1) {
if (reduction_tid < factor) {
reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + factor]);
}
block_sync::sync();
}
if (should_write && write_pred) {
T result = out;
reduction_op(result, shared_mem[smem_offset]);
if (reduction_size > 1) {
reduction_op(result, shared_mem[smem_offset + 1]);
}
out = result;
}
block_sync::sync();
}
// Use the same pred for both reads and writes
template <
bool X_REDUCE,
bool Y_REDUCE,
bool Z_REDUCE,
typename T,
typename Func,
typename _dim3,
typename _dim3_2>
__device__ void blockReduce(
T& out,
const T& inp_val,
Func reduction_op,
const _dim3& thread_idx,
const _dim3_2& block_dim,
T* shared_mem,
bool read_write_pred,
T init_val) {
blockReduce<X_REDUCE, Y_REDUCE, Z_REDUCE, T, Func, _dim3, _dim3_2>(
out,
inp_val,
reduction_op,
thread_idx,
block_dim,
shared_mem,
read_write_pred,
read_write_pred,
init_val);
}
// Inter-block reduction.
//
// The gridReduce function performs point-wise reductions of scalars across
// thread blocks. Thread blocks are disjointly partitioned into groups,
// "reduction segments", that are collectively defined by boolean template
// parameters, X_BLOCK, Y_BLOCK and Z_BLOCK. Each of X/Y/Z_BLOCK determines
// whether thread blocks along the dimension should be grouped into the same
// reduction segment. Cross-block reducitons are independently done within each
// segment and generates distinctive results per segment. For instance, if all
// of X/Y/Z_BLOCK are true, reductions will be done across all thread blocks
// since there will be just a single segment consisting of all thread blocks. If
// none of them are true, each thread block will become a segment by itself, so
// no reduction will be performed.
//
// The input scalars to reduce within each segment are a certain subset of
// thread-private scalars provided as part of the gridReduce function
// parameters. Boolean template parameters, X_THREAD, Y_THREAD and Z_THREAD,
// determine which subset of the scalars should be used for inter-block
// reductions. Specifically, all the input scalars of threads along each
// dimension will be used when X/Y/Z_THREAD are true. Otherwise, only the value
// held at offset 0 of each dimension will be used. Thus, for example, if all of
// X/Y/Z_THREAD are true, the scalars of all threads in each block will
// participate in inter-block reductions. If all of them are false, only one
// scalar of the thread at threadIdx.x == threadIdx.y == threadIdx.z == 0 will
// be used. In the code below, we call the subset of threads a "reduction
// block". "Participating" thread dimensions here are similar to the
// "non-participating" block dimensions. They come from a block dimension that
// has not been reduced before hitting this grid reduction.
//
// Inter-block reductions perform point-wise reductions of scalars of reduction
// blocks within each reduction segment. More specifically, let rb be a
// reduction block and rs be a reduction segment. Let IN(thread_idx, block_idx)
// denote the input scalar of thread at thread_idx and block_idx. The result of
// each reduction segment, OUT(thread_idx, block_idx_out), is defined only for
// each thread_idx in thread block block_idx_out in the segment as follows:
//
// OUT(thread_idx, block_idx_out) =
// Reduction of IN(thread_idx, block_idx) for
// all block_idx in a reduction segment
//
// OUT is not given for all threads that are not in block_idx_out and the
// reduction block.
//
// See also the function comment of gridReduce.
namespace reduction {
// Reduces all the reduction blocks in each reduction segment. This is the
// "cleanup" stage of a grid reduction.
//
// This is only called by one thread block per reduction segment. The input
// reduction blocks of the segment are stored in an intermediate buffer pointed
// by parameter in. Template parameters X/Y/Z_THREAD denote how the reduction
// block is formed.
//
// The size of a reduction block is by definition smaller or equal to the size
// of a thread block. We use the remaining threads to parallelize reductions
// across reduction blocks. For example, when X/Y/Z_THREAD = {true, false,
// false}, we use blockDim.y*blockDim.z threads for each output value. This is
// done first by loading the input values in parallel and then by reducing
// across threads of dimensions whose XYZ_THREAD are false.
//
// Note that what is done here after the loading from global memory is similar
// to what the existing blockReduce function does.
template <
bool X_THREAD,
bool Y_THREAD,
bool Z_THREAD,
typename T,
typename Func>
__device__ void gridReduceLastBlock(
T& out,
const T* in,
const nvfuser_index_t
grid_reduction_segment_size, // Number of reductions across
// grid reduce dimensions
const nvfuser_index_t
block_reduction_segment_size, // Number of reductions across the block
Func reduction_op,
T* shared_buf,
bool write_pred,
T init_val) {
// We have to do num_reductions across reduction_size. The reductions are
// contiguous, but offset by reduction_size. There is an entry in "in" for
// every block, and every thread marked as true. Threads in dimensions marked
// as false can be used to parallelize the reduction.
// Find the reduction id of the participating threads
const auto block_reduction_segment_idx =
index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
threadIdx, blockDim);
// Find an id associated within a reduction segment for all
// "non-participating" threads, which will parallelize the reductions for the
// "participating" threads
const auto id_in_block_segment =
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
threadIdx, blockDim);
// Stride by the "non-participating" threads
const auto input_stride_for_thread_in_segment =
index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
T inp = init_val;
// Block stride across the reduction until we only have one value per thread
for (nvfuser_index_t reduction_i = id_in_block_segment;
reduction_i < grid_reduction_segment_size;
reduction_i += input_stride_for_thread_in_segment) {
auto work_buf_offset = reduction_i * block_reduction_segment_size +
block_reduction_segment_idx;
reduction_op(inp, in[work_buf_offset]);
}
// Block reduce the per thread values into per "participating" thread values
T inp_tmp = init_val;
blockReduce<!X_THREAD, !Y_THREAD, !Z_THREAD>(
inp_tmp,
inp,
reduction_op,
threadIdx,
blockDim,
shared_buf,
true,
init_val);
const bool should_write = (X_THREAD || threadIdx.x == 0) &&
(Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
if (should_write && write_pred) {
reduction_op(out, inp_tmp);
}
}
// Reduces per-thread values across thread blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - reduction_op: Scalar reduction function
// - work_buf: Temporary buffer for cross-block reductions
// - sync_flags: A vector of integers for synchronizations
// - shared_buf: Shared memory buffer for intra-block reduction
//
// Thread has valid results based on if it's the last block in the grid
// reduction dimension
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, reduces across thread blocks along the X/Y/Z
// dimensions
// - X/Y/Z_THREAD: When true, all threads along the X/Y/Z dimensions participate
// in the cross-block reduction. Otherwise, only threads at offset 0 do.
// These are set to true if the dimension in the block has not been reduced
// previously in producer tensors, and does not participate in the reduction
// (right now they can't), so it's just a "pure" iteration domain as far as
// the grid reduce is concerned.
// - PERSISTENT_REDUCTION: Indicates grid reduction will be called in a loop, or
// the result of the grid reduction will be broadcasted and used across the
// grid. These requires cross grid communication and the grid synchronizations
// here to actually synchronize across the entire grid. When false the grid is
// not synchronized, the last block just waits for everyone else to finish and
// the other blocks can exit early.
// - T: Scalar data type of input/output data
// - Func: Type of scalara reduction function
//
// Template parameters X/Y/Z_BLOCK define a group of thread blocks that are
// reduced together. We call it a reduction segment. Some examples are:
//
// Case 1: X/Y/Z_BLOCK == true/true/true -> There is only one segment, which
// includes all thread blocks. It is effecively the same as the grid.
//
// Case 2: X/Y/Z_BLOCK == false/false/false -> Each thread block comprises an
// individual segment by itself.
//
// Case 3: X/Y/Z_BLOCK == true/false/false -> Each segment contains thread
// blocks that have the same blockDim.x. There will be blockDim.y*blockDim.z
// such segments.
//
// X/Y/Z_THREAD defines a sub region of a thread block that should be reduced
// with the sub regions of other thread blocks. We call it a reduction block.
// E.g.,
//
// Case 1: X/Y/Z_THREAD == false/false/false -> Only thread 0 participates in
// the cross-block reductions. The reduction block is 1x1x1 with thread 0.
//
// Case 2: X/Y/Z_THREAD == true/true/true-> All threads in a thread block
// participate in the cross-block reductions. The reduction block in this case
// is equivalent to the thread block.
//
// After the function completes, only one thread block per reduction segment
// gets valid reduction results. There is no guarantee which particular block
// gets the final results.
//
template <
bool X_BLOCK,
bool Y_BLOCK,
bool Z_BLOCK,
bool X_THREAD,
bool Y_THREAD,
bool Z_THREAD,
bool PERSISTENT_REDUCTION,
typename T,
typename Func>
__device__ void gridReduce(
T& out,
const T& inp_val,
Func reduction_op,
volatile T* work_buf,
Tensor<int64_t, 1> sync_flags,
T* shared_buf,
bool read_pred,
bool write_pred,
T init_val) {
// Number of values to reduce in the reduction segment
const auto grid_reduction_segment_size =
index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
// Index of the reduction we're performing out of the
// grid_reduction_segment_size
const auto idx_in_grid_segment =
index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
blockIdx, gridDim);
// Number of threads we can use in final reduction, Seems to assume all
// threads in the block participate
const auto block_reduction_segment_size =
index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);
// advance to the offset for this segment
// index of reduction * size of the reduction * size of threads
work_buf += idx_in_grid_segment * grid_reduction_segment_size *
block_reduction_segment_size;
if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
(Z_THREAD || threadIdx.z == 0)) {
auto block_offset =
index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
auto thread_offset =
index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
threadIdx, blockDim);
auto work_buf_offset =
block_offset * block_reduction_segment_size + thread_offset;
if (read_pred) {
work_buf[work_buf_offset] = inp_val;
} else {
work_buf[work_buf_offset] = init_val;
}
}
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
bool last_block =
index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
if (last_block) {
// Cleanup with block reduction
gridReduceLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
out,
(T*)work_buf,
grid_reduction_segment_size,
block_reduction_segment_size,
reduction_op,
shared_buf,
write_pred,
init_val);
}
if (PERSISTENT_REDUCTION) {
// Make sure we're done with global memory before we allow the kernel to
// continue
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
}
}
} // namespace reduction
#undef isize
#undef ioffset
namespace grid_broadcast {
// Broadcasts per-thread values across threads and blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - work_buf: Temporary buffer for communication across threads/blocks
// - sync_flags: A vector of integers for synchronizations
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, broadcasts across thread blocks along the X/Y/Z
// dimensions
// - X/Y/Z_THREAD: When true, broadcasts across threads along the X/Y/Z
// dimensions
template <
bool X_BLOCK,
bool Y_BLOCK,
bool Z_BLOCK,
bool X_THREAD,
bool Y_THREAD,
bool Z_THREAD,
typename T>
__device__ void broadcast(
T& out,
const T& inp_val,
volatile T* work_buf,
Tensor<int64_t, 1> sync_flags,
bool read_write_pred) {
// Number of values broadcasted in the grid dimensions
const auto grid_seg_size =
index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
// Index of the broadcast we're performing out of the grid_seg_size
const auto grid_seg_idx =
index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
blockIdx, gridDim);
// Number of threads not participating in a broadcast dimension, this is the
// number of thread entries to expect in the work buffer, therefore a striding
const auto block_stride =
index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
// Which broadcast in the block this is to line up the entry with the work
// buffer
const auto thread_offset =
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
threadIdx, blockDim);
const bool has_valid_data = (!X_BLOCK || blockIdx.x == gridDim.x - 1) &&
(!Y_BLOCK || blockIdx.y == gridDim.y - 1) &&
(!Z_BLOCK || blockIdx.z == gridDim.z - 1) &&
(!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) &&
(!Z_THREAD || threadIdx.z == 0);
if (has_valid_data && read_write_pred) {
work_buf[grid_seg_idx * block_stride + thread_offset] = inp_val;
__threadfence();
}
bool null = false;
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
sync_flags[grid_seg_idx], grid_seg_size);
if (read_write_pred) {
out = work_buf[grid_seg_idx * block_stride + thread_offset];
}
// Make sure everyone has read from the buffer before continuing the kernel
// and potentially overwriting
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
sync_flags[grid_seg_idx], grid_seg_size);
}
} // namespace grid_broadcast
namespace broadcast {
// Broadcasts within partitioned groups of threads.
//
// X_THREAD: Broadcast from threadIdx.x == 0 if true
// Y_THREAD: Broadcast from threadIdx.y == 0 if true
// Z_THREAD: Broadcast from threadIdx.z == 0 if true
// inp_val: Per-thread source value. Only valid when the thread is a source.
// out: Per-thread output location
//
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T>
__device__ void blockBroadcast(
T& out,
const T& inp_val,
T* shared_mem,
bool read_write_pred) {
const bool has_valid_data = (!X_THREAD || threadIdx.x == 0) &&
(!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0);
const auto shared_offset =
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
threadIdx, blockDim);
if (has_valid_data && read_write_pred) {
shared_mem[shared_offset] = inp_val;
}
block_sync::sync();
if (read_write_pred) {
out = shared_mem[shared_offset];
}
block_sync::sync();
}
} // namespace broadcast
// -----------------------------------------------------------------------------------------------
// Block Welford Primitives
// -----------------------------------------------------------------------------------------------
// Basic utility for welford update. Can be used to scan one value, or two merge
// two welford results
template <typename T, typename TN>
__inline__ __device__ void welfordCombine(
T& a_avg,
T& a_M2,
TN& a_N,
const T& b_avg,
const T& b_M2,
TN b_N) {
if (b_N == 0) {
return;
}
TN ab_N = a_N + b_N;
T b_N_div_ab_N = ((T)(nvfuser_index_t)(b_N)) / ((T)(nvfuser_index_t)(ab_N));
T delta = b_avg - a_avg;
a_avg += delta * b_N_div_ab_N;
a_M2 += b_M2 + delta * delta * ((T)(nvfuser_index_t)(a_N)) * b_N_div_ab_N;
a_N = ab_N;
}
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block.
template <
bool X_REDUCE,
bool Y_REDUCE,
bool Z_REDUCE,
typename T,
typename TN,
typename _dim3,
typename _dim3_2>
__inline__ __device__ void blockWelford(
T& out_avg,
T& out_M2,
TN& out_N,
const T& in_avg,
const T& in_M2,
const TN& in_N,
const _dim3& thread_idx,
const _dim3_2& block_dim,
T* shared_mem_avg,
T* shared_mem_M2,
TN* shared_mem_N,
bool read_pred,
bool write_pred,
T init_val) {
// If this thread will output a final result
bool should_write =
index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);
// Size of the reduction segments
unsigned int reduction_size =
index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);
// Index into the reduction segment
unsigned int reduction_tid =
index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
thread_idx, block_dim);
// Index of the reduction segment
unsigned int reduction_idx =
index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
thread_idx, block_dim);
// Offset into smem for the current thread
unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;
if (read_pred) {
shared_mem_avg[smem_offset] = in_avg;
shared_mem_M2[smem_offset] = in_M2;
shared_mem_N[smem_offset] = in_N;
} else {
shared_mem_avg[smem_offset] = init_val;
shared_mem_M2[smem_offset] = init_val;
shared_mem_N[smem_offset] = 0;
}
block_sync::sync();
// Reduce down to nearest power of 2:
int np2 = 1 << (31 - __clz(reduction_size));
if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
welfordCombine(
shared_mem_avg[smem_offset],
shared_mem_M2[smem_offset],
shared_mem_N[smem_offset],
shared_mem_avg[smem_offset + np2],
shared_mem_M2[smem_offset + np2],
shared_mem_N[smem_offset + np2]);
}
block_sync::sync();
// loop peel the final iteration to save one syncthread for the end
for (int factor = np2 / 2; factor > 1; factor >>= 1) {
if (reduction_tid < factor) {
welfordCombine(
shared_mem_avg[smem_offset],
shared_mem_M2[smem_offset],
shared_mem_N[smem_offset],
shared_mem_avg[smem_offset + factor],
shared_mem_M2[smem_offset + factor],
shared_mem_N[smem_offset + factor]);
}
block_sync::sync();
}
if (should_write && write_pred) {
T res_avg = out_avg;
T res_M2 = out_M2;
TN res_N = out_N;
welfordCombine(
res_avg,
res_M2,
res_N,
shared_mem_avg[smem_offset],
shared_mem_M2[smem_offset],
shared_mem_N[smem_offset]);
if (reduction_size > 1) {
welfordCombine(
res_avg,
res_M2,
res_N,
shared_mem_avg[smem_offset + 1],
shared_mem_M2[smem_offset + 1],
shared_mem_N[smem_offset + 1]);
}
out_avg = res_avg;
out_M2 = res_M2;
out_N = res_N;
}
block_sync::sync();
}
// Use the same pred for both reads and writes
template <
bool X_REDUCE,
bool Y_REDUCE,
bool Z_REDUCE,
typename T,
typename TN,
typename _dim3,
typename _dim3_2>
__inline__ __device__ void blockWelford(
T& out_avg,
T& out_M2,
TN& out_N,
const T& in_avg,
const T& in_M2,
const TN& in_N,
const _dim3& thread_idx,
const _dim3_2& block_dim,
T* shared_mem_avg,
T* shared_mem_M2,
TN* shared_mem_N,
bool read_write_pred,
T init_val) {
blockWelford<X_REDUCE, Y_REDUCE, Z_REDUCE, T, TN, _dim3, _dim3_2>(
out_avg,
out_M2,
out_N,
in_avg,
in_M2,
in_N,
thread_idx,
block_dim,
shared_mem_avg,
shared_mem_M2,
shared_mem_N,
read_write_pred,
read_write_pred,
init_val);
}
// -----------------------------------------------------------------------------------------------
// Grid Welford Prototype
// -----------------------------------------------------------------------------------------------
namespace welford {
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T, typename TN>
__device__ void gridWelfordLastBlock(
T& out_avg,
T& out_M2,
TN& out_N,
const T* in_avg,
const T* in_M2,
const TN* in_N,
const nvfuser_index_t
grid_reduction_segment_size, // Number of reductions across
// grid reduce dimensions
const nvfuser_index_t
block_reduction_segment_size, // Number of reductions across the block
T* shared_buf_avg,
T* shared_buf_M2,
TN* shared_buf_N,
bool write_pred,
T init_val) {
// We have to do num_reductions across reduction_size. The reductions are
// contiguous, but offset by reduction_size. There is an entry in "in" for
// every block, and every thread marked as true. Threads in dimensions marked
// as false can be used to parallelize the reduction.
// Find the reduction id of the participating threads
const auto block_reduction_segment_idx =
index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
threadIdx, blockDim);
// Find an id associated within a reduction segment for all
// "non-participating" threads, which will parallelize the reductions for the
// "participating" threads
const auto id_in_block_segment =
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
threadIdx, blockDim);
// Stride by the "non-participating" threads
const auto input_stride_for_thread_in_segment =
index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
T inp_avg = init_val;
T inp_M2 = init_val;
TN inp_N = 0;
// Block stride across the reduction until we only have one value per thread
for (nvfuser_index_t reduction_i = id_in_block_segment;
reduction_i < grid_reduction_segment_size;
reduction_i += input_stride_for_thread_in_segment) {
auto work_buf_offset = reduction_i * block_reduction_segment_size +
block_reduction_segment_idx;
welfordCombine(
inp_avg,
inp_M2,
inp_N,
in_avg[work_buf_offset],
in_M2[work_buf_offset],
in_N[work_buf_offset]);
}
// Block reduce the per thread values into per "participating" thread values
T inp_avg_tmp = init_val;
T inp_M2_tmp = init_val;
TN inp_N_tmp = 0;
blockWelford<!X_THREAD, !Y_THREAD, !Z_THREAD>(
inp_avg_tmp,
inp_M2_tmp,
inp_N_tmp,
inp_avg,
inp_M2,
inp_N,
threadIdx,
blockDim,
shared_buf_avg,
shared_buf_M2,
shared_buf_N,
true,
init_val);
const bool should_write = (X_THREAD || threadIdx.x == 0) &&
(Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
if (should_write && write_pred) {
welfordCombine(out_avg, out_M2, out_N, inp_avg_tmp, inp_M2_tmp, inp_N_tmp);
}
}
// Grid welford combine
template <
bool X_BLOCK,
bool Y_BLOCK,
bool Z_BLOCK,
bool X_THREAD,
bool Y_THREAD,
bool Z_THREAD,
bool PERSISTENT_REDUCTION,
typename T,
typename TN>
__device__ void gridWelford(
T& out_avg,
T& out_M2,
TN& out_N,
const T& inp_avg,
const T& inp_M2,
const TN& inp_N,
volatile T* work_buf_avg,
volatile T* work_buf_M2,
volatile TN* work_buf_N,
Tensor<int64_t, 1> sync_flags,
T* shared_buf_avg,
T* shared_buf_M2,
TN* shared_buf_N,
bool read_pred,
bool write_pred,
T init_val) {
// Number of values to reduce in the reduction segment
const auto grid_reduction_segment_size =
index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
// Index of the reduction we're performing out of the
// grid_reduction_segment_size
const auto idx_in_grid_segment =
index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
blockIdx, gridDim);
// Number of threads we can use in final reduction, Seems to assume all
// threads in the block participate
const auto block_reduction_segment_size =
index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);
// advance to the offset for this segment
// index of reduction * size of the reduction * size of threads
work_buf_avg += idx_in_grid_segment * grid_reduction_segment_size *
block_reduction_segment_size;
work_buf_M2 += idx_in_grid_segment * grid_reduction_segment_size *
block_reduction_segment_size;
work_buf_N += idx_in_grid_segment * grid_reduction_segment_size *
block_reduction_segment_size;
if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
(Z_THREAD || threadIdx.z == 0)) {
auto block_offset =
index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
auto thread_offset =
index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
threadIdx, blockDim);
auto work_buf_offset =
block_offset * block_reduction_segment_size + thread_offset;
if (read_pred) {
work_buf_avg[work_buf_offset] = inp_avg;
work_buf_M2[work_buf_offset] = inp_M2;
work_buf_N[work_buf_offset] = inp_N;
} else {
work_buf_avg[work_buf_offset] = init_val;
work_buf_M2[work_buf_offset] = init_val;
work_buf_N[work_buf_offset] = 0;
}
}
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
bool last_block =
index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
if (last_block) {
// final reduction
gridWelfordLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
out_avg,
out_M2,
out_N,
(T*)work_buf_avg,
(T*)work_buf_M2,
(TN*)work_buf_N,
grid_reduction_segment_size,
block_reduction_segment_size,
shared_buf_avg,
shared_buf_M2,
shared_buf_N,
write_pred,
init_val);
}
if (PERSISTENT_REDUCTION) {
// Make sure we're done with global memory before we allow the kernel to
// continue
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
}
}
} // namespace welford
#undef isize
#undef ioffset
namespace warp {
template <
bool SINGLE_WARP,
typename T,
typename Func,
typename _dim3ti,
typename _dim3bd>
__device__ void warpReduceTIDX(
T& out,
const T& inp_val,
Func reduction_op,
const _dim3ti& thread_idx,
const _dim3bd& block_dim,
T* shared_mem,
bool read_write_pred,
T init_val) {
constexpr int WARP_SIZE = 32;
// Assume input padded to multiples of a warp
T reduce_val = init_val;
// Do warp reduction
if (read_write_pred) {
reduce_val = inp_val;
}
// Reduce within each warp
for (int i = 16; i >= 1; i /= 2) {
reduction_op(
reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, WARP_SIZE));
}
// Reduce across warp if needed
// Load value to shared mem
if (!SINGLE_WARP) {
unsigned int warp_idx = thread_idx.x / WARP_SIZE;
unsigned int lane_idx = thread_idx.x % WARP_SIZE;
unsigned int reduce_group_id = thread_idx.z * block_dim.y + thread_idx.y;
bool is_warp_head = lane_idx == 0;
unsigned int reduction_size = block_dim.x;
unsigned int num_of_warps = reduction_size / WARP_SIZE;
unsigned int smem_offset = reduce_group_id * num_of_warps;
block_sync::sync();
if (read_write_pred && is_warp_head) {
shared_mem[smem_offset + warp_idx] = reduce_val;
}
block_sync::sync();
if (warp_idx == 0) {
// This assumes num_of_warps will be < 32, meaning < 1024 blocks.
// Should be true for long enough.
assert(num_of_warps <= 32);
reduce_val = lane_idx < num_of_warps ? shared_mem[smem_offset + lane_idx]
: init_val;
// Reduce within warp 0
for (int i = 16; i >= 1; i /= 2) {
reduction_op(
reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, 32));
}
}
if (is_warp_head) {
reduction_op(out, reduce_val);
}
} else {
reduction_op(out, reduce_val);
}
}
} // namespace warp
// No "#pragma once" because this is a raw definition that can be copied by jit codegen.
// Eager mode clients should not include this file directly, instead,
// they should #include <ATen/cuda/CUDAGeneratorImpl.h>, which has a #pragma once.
// Stores RNG state values. Passed as a kernel argument.
// See Note [CUDA Graph-safe RNG states].
//
// The raw definition lives in its own file so jit codegen can easily copy it.
namespace at {
struct PhiloxCudaState {
PhiloxCudaState() = default;
// Called if graph capture is not underway
PhiloxCudaState(uint64_t seed,
uint64_t offset) {
seed_ = seed;
offset_.val = offset;
}
// Called if graph capture is underway
PhiloxCudaState(uint64_t seed,
int64_t* offset_extragraph,
uint32_t offset_intragraph) {
seed_ = seed;
offset_.ptr = offset_extragraph;
offset_intragraph_ = offset_intragraph;
captured_ = true;
}
// Public members, directly accessible by at::cuda::philox::unpack.
// If we made them private with getters/setters, the getters/setters
// would have to be __device__, and we can't declare __device__ in ATen.
union Payload {
uint64_t val;
int64_t* ptr;
};
uint64_t seed_ = 0;
Payload offset_;
uint32_t offset_intragraph_ = 0;
bool captured_ = false;
};
} // namespace at
__global__ void kernel127(Tensor<bool, 0> T0, Tensor<bool, 0> T1, Tensor<bool, 0> T2, Tensor<int64_t, 0> T3, Tensor<int64_t, 0> T4, Tensor<bool, 0> T6, Tensor<int64_t, 0> T5) {
T6[0]
= where(T0[0], T1[0], T2[0]);
T5[0]
= where(T0[0], T3[0], T4[0]);
}
}
CUDA NVRTC compile error: default_program(1670): error: more than one instance of overloaded function "CudaCodeGen::where" matches the argument list:
function "CudaCodeGen::where(__nv_bool, double, double)"
function "CudaCodeGen::where(__nv_bool, float, float)"
function "CudaCodeGen::where(__nv_bool, CudaCodeGen::int64_t, CudaCodeGen::int64_t)"
argument types are: (__nv_bool, __nv_bool, __nv_bool)
1 error detected in the compilation of "default_program".
======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: sorted_exprs.size() > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp":1403, please report a bug to PyTorch. Error during expression sorting, no expressions produced.
======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Expected kernel_ to be true, but got false. (Could this error message be improved? If so, please report an enhancement request to PyTorch.)
======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Expected kernel_ to be true, but got false. (Could this error message be improved? If so, please report an enhancement request to PyTorch.)
======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: falseINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/executor_utils.cpp":828, please report a bug to PyTorch. namespace CudaCodeGen {
typedef unsigned char uint8_t;
typedef signed char int8_t;
typedef short int int16_t;
typedef int int32_t;
typedef unsigned int uint32_t;
typedef long long int int64_t;
typedef unsigned long long int uint64_t;
typedef int nvfuser_index_t;
#define __NVFUSER_HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_HALF_TO_CUS(var) \
*(reinterpret_cast<const unsigned short*>(&(var)))
struct __half;
__device__ __half __float2half(const float);
struct __align__(2) __half {
__half() = default;
__device__ __half(const float f) {
__x = __float2half(f).__x;
}
protected:
unsigned short __x;
};
__device__ __half __float2half(const float f) {
__half val;
asm("{ cvt.rn.f16.f32 %0, %1;}\n"
: "=h"(__NVFUSER_HALF_TO_US(val))
: "f"(f));
return val;
}
__device__ float __half2float(const __half h) {
float val;
asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__NVFUSER_HALF_TO_CUS(h)));
return val;
}
// aligned vector generates vectorized load/store on CUDA
template <typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) Array {
scalar_t val[vec_size];
__device__ void set(scalar_t v) {
for (int i = 0; i < vec_size; ++i) {
val[i] = v;
}
}
};
#define __NVFUSER_BFLOAT_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_BFLOAT_TO_CUS(var) \
*(reinterpret_cast<const unsigned short*>(&(var)))
struct __bfloat;
__device__ __bfloat __float2bfloat(const float);
struct __align__(2) __bfloat {
__bfloat() = default;
__device__ __bfloat(const float f) {
__x = __float2bfloat(f).__x;
}
protected:
unsigned short __x;
};
__device__ __bfloat __float2bfloat(const float f) {
__bfloat val;
asm("{ cvt.rn.bf16.f32 %0, %1;}\n"
: "=h"(__NVFUSER_BFLOAT_TO_US(val))
: "f"(f));
return val;
}
__device__ float __bfloat2float(const __bfloat h) {
float val;
asm("{ mov.b32 %0, {0,%1};}\n"
: "=f"(val)
: "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
return val;
}
template <typename T, int N>
struct Tensor {
__device__ T& operator[](nvfuser_index_t ind) {
return data[ind];
};
T* data;
nvfuser_index_t size[N];
nvfuser_index_t stride[N];
};
// Specialization for 0-dim case as it does not need size and stride arrays.
// They will be an error as well since zero-length arrays are not allowed.
template <typename T>
struct Tensor<T, 0> {
__device__ T& operator[](nvfuser_index_t) {
return *data;
};
T* data;
};
class Philox {
public:
__device__ Philox(
unsigned long long seed,
unsigned long long subsequence,
unsigned long long offset) {
key.x = (unsigned int)seed;
key.y = (unsigned int)(seed >> 32);
counter = make_uint4(0, 0, 0, 0);
counter.z = (unsigned int)(subsequence);
counter.w = (unsigned int)(subsequence >> 32);
STATE = 0;
incr_n(offset / 4);
}
__device__ unsigned long operator()() {
if (STATE == 0) {
uint4 counter_ = counter;
uint2 key_ = key;
for (int i = 0; i < 9; i++) {
counter_ = single_round(counter_, key_);
key_.x += (kPhilox10A);
key_.y += (kPhilox10B);
}
output = single_round(counter_, key_);
incr();
}
unsigned long ret = 0;
switch (STATE) {
case 0:
ret = output.x;
break;
case 1:
ret = output.y;
break;
case 2:
ret = output.z;
break;
case 3:
ret = output.w;
break;
}
STATE = (STATE + 1) % 4;
return ret;
}
private:
__device__ void incr_n(unsigned long long n) {
unsigned int nlo = (unsigned int)(n);
unsigned int nhi = (unsigned int)(n >> 32);
counter.x += nlo;
if (counter.x < nlo)
nhi++;
counter.y += nhi;
if (nhi <= counter.y)
return;
if (++counter.z)
return;
++counter.w;
}
__device__ void incr() {
if (++counter.x)
return;
if (++counter.y)
return;
if (++counter.z)
return;
++counter.w;
}
__device__ unsigned int mulhilo32(
unsigned int a,
unsigned int b,
unsigned int* result_high) {
*result_high = __umulhi(a, b);
return a * b;
}
__device__ uint4 single_round(uint4 ctr, uint2 key) {
unsigned int hi0;
unsigned int hi1;
unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
return ret;
}
private:
static constexpr unsigned long kPhilox10A = 0x9E3779B9;
static constexpr unsigned long kPhilox10B = 0xBB67AE85;
static constexpr unsigned long kPhiloxSA = 0xD2511F53;
static constexpr unsigned long kPhiloxSB = 0xCD9E8D57;
uint4 counter = {};
uint4 output = {};
uint2 key = {};
unsigned int STATE = 0;
};
__device__ float uniformf(unsigned int x) {
constexpr float kRanInvM32 = 2.3283064e-10f; // Inverse of 2^32.
return x * kRanInvM32;
}
__device__ double uniform(unsigned int x, unsigned int y) {
constexpr double kRan2Pow53Inv = 1.1102230246251565e-16;
const unsigned long long z =
(unsigned long long)x ^ ((unsigned long long)y << (53 - 32));
return z * kRan2Pow53Inv + (kRan2Pow53Inv / 2.0);
}
#define NVFUSER_DEFINE_MAGIC_ZERO \
__shared__ int nvfuser_zero_s; \
if (threadIdx.x == 0) \
nvfuser_zero_s = 0; \
__syncthreads(); \
atomicMin(&nvfuser_zero_s, threadIdx.x); \
int nvfuser_zero = nvfuser_zero_s;
#define NVFUSER_UPDATE_MAGIC_ZERO \
do { \
nvfuser_zero <<= 1; \
} while (0);
__device__ constexpr int ceilDiv(int a, int b) {
return (a + b - 1) / b;
}
__device__ constexpr int64_t ceilDiv(int64_t a, int64_t b) {
return (a + b - 1) / b;
}
__device__ constexpr int64_t ceilDiv(int64_t a, int b) {
return ceilDiv(a, (int64_t)b);
}
__device__ constexpr int64_t ceilDiv(int a, int64_t b) {
return ceilDiv((int64_t)a, b);
}
__device__ constexpr int max(int a, int b) {
return ::max(a, b);
}
__device__ constexpr int64_t max(int64_t a, int b) {
return ::max(a, (int64_t)b);
}
__device__ constexpr int64_t max(int a, int64_t b) {
return ::max((int64_t)a, b);
}
__device__ constexpr int64_t max(int64_t a, int64_t b) {
return ::max(a, b);
}
__device__ double fmax(double a, double b) {
// check and propagate NaN
if (a != a) {
return a;
} else if (b != b) {
return b;
} else {
return ::fmax(a, b);
}
}
__device__ float fmax(float a, float b) {
// check and propagate NaN
if (a != a) {
return a;
} else if (b != b) {
return b;
} else {
return ::fmax(a, b);
}
}
__device__ constexpr int min(int a, int b) {
return ::min(a, b);
}
__device__ constexpr int64_t min(int64_t a, int b) {
return ::min(a, (int64_t)b);
}
__device__ constexpr int64_t min(int a, int64_t b) {
return ::min((int64_t)a, b);
}
__device__ constexpr int64_t min(int64_t a, int64_t b) {
return ::min(a, b);
}
__device__ double fmin(double a, double b) {
// check and propagate NaN
if (a != a) {
return a;
} else if (b != b) {
return b;
} else {
return ::fmin(a, b);
}
}
__device__ float fmin(float a, float b) {
// check and propagate NaN
if (a != a) {
return a;
} else if (b != b) {
return b;
} else {
return ::fmin(a, b);
}
}
__device__ constexpr int alignBufferSize(int buffer, int size) {
return (buffer + (size - 1)) & ~(size - 1);
}
__device__ double clamp(double x, double minv, double maxv) {
return x < minv ? minv : (x > maxv ? maxv : x);
}
__device__ float clamp(float x, double minv, double maxv) {
return x < minv ? minv : (x > maxv ? maxv : x);
}
__device__ double frac(double x) {
return x - trunc(x);
}
__device__ float frac(float x) {
return x - trunc(x);
}
__device__ double gelu(double x) {
return x * normcdf(x);
}
__device__ float gelu(float x) {
return x * normcdf(x);
}
__device__ double reciprocal(double x) {
return 1 / x;
}
__device__ float reciprocal(float x) {
return 1 / x;
}
__device__ double relu(double x) {
return x <= 0 ? 0 : x;
}
__device__ float relu(float x) {
return x <= 0 ? 0 : x;
}
__device__ float relu(int64_t x) {
return x <= 0 ? 0 : x;
}
__device__ float relu(int x) {
return x <= 0 ? 0 : x;
}
__device__ double remainder(double a, double b) {
auto mod = ::fmod(a, b);
if ((mod != 0) && ((b < 0) != (mod < 0)))
mod += b;
return mod;
}
__device__ float remainder(float a, float b) {
auto mod = ::fmod(a, b);
if ((mod != 0) && ((b < 0) != (mod < 0)))
mod += b;
return mod;
}
__device__ double sigmoid(double x) {
return 1 / (1 + exp(-x));
}
__device__ float sigmoid(float x) {
return 1 / (1 + exp(-x));
}
__device__ double silu(double x) {
return x * sigmoid(x);
}
__device__ float silu(float x) {
return x * sigmoid(x);
}
__device__ double threshold(double x, double t, double v) {
return x <= t ? v : x;
}
__device__ float threshold(float x, double t, double v) {
return x <= t ? v : x;
}
__device__ double where(bool c, double a, double b) {
return c ? a : b;
}
__device__ float where(bool c, float a, float b) {
return c ? a : b;
}
__device__ int64_t where(bool c, int64_t a, int64_t b) {
return c ? a : b;
}
__device__ double randLike(Philox& rnd) {
return uniform(rnd(), rnd());
}
__device__ float randLikef(Philox& rnd) {
return uniformf(rnd());
}
__device__ constexpr int64_t remainder(int64_t a, int64_t b) {
auto mod = a % b;
if ((mod != 0) && ((b < 0) != (mod < 0)))
mod += b;
return mod;
}
__device__ constexpr int remainder(int a, int b) {
auto mod = a % b;
if ((mod != 0) && ((b < 0) != (mod < 0)))
mod += b;
return mod;
}
__device__ constexpr int64_t fmod(int64_t a, int64_t b) {
return a % b;
}
__device__ constexpr int fmod(int a, int b) {
return a % b;
}
__device__ constexpr double fmod(double a, double b) {
return ::fmod(a, b);
}
__device__ constexpr float fmod(float a, float b) {
return ::fmod(a, b);
}
template <typename T>
__device__ T pow(T a, T b) {
if (b < 0) {
if (a == 1) {
return 1;
} else if (a == -1) {
auto negative = (-b) % static_cast<T>(2);
return negative ? -1 : 1;
} else {
return 0;
}
} else {
T result = 1;
while (b) {
if (b & 1) {
result *= a;
}
b /= 2;
a *= a;
}
return result;
}
}
template int pow<int>(int a, int b);
template int64_t pow<int64_t>(int64_t a, int64_t b);
template <>
float pow<float>(float a, float b) {
return ::pow(a, b);
}
template <>
double pow<double>(double a, double b) {
return ::pow(a, b);
}
namespace index_utils {
// Utility functions
// Total size of provided dimension
template <typename _dim3>
__device__ __forceinline__ nvfuser_index_t size(const _dim3& d) {
return (nvfuser_index_t)d.x * (nvfuser_index_t)d.y * (nvfuser_index_t)d.z;
}
// Linearized indexing of idx based on dim, if bool==false that dimension does
// not participate
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t maskedOffset(const _dim3& idx, const _dim3_2& dim) {
nvfuser_index_t offset = 0;
if (Z)
offset += idx.z;
if (Y)
offset = offset * dim.y + idx.y;
if (X)
offset = offset * dim.x + idx.x;
return offset;
}
// Linearized indexing of idx based on dim. All dimensions participate.
template <typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t offset(const _dim3& idx, const _dim3_2& dim) {
nvfuser_index_t offset = idx.z;
offset = offset * dim.y + idx.y;
offset = offset * dim.x + idx.x;
return offset;
}
// Masks the provided dim3, those == false get truncated to 1
template <bool X, bool Y, bool Z, typename _dim3>
__device__ dim3 maskedDims(const _dim3& dim) {
return dim3{
X ? (unsigned)dim.x : 1U,
Y ? (unsigned)dim.y : 1U,
Z ? (unsigned)dim.z : 1U};
}
// Provides total size of dim with masking, those dims == false do not
// participate in the size calculation
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
__device__ nvfuser_index_t maskedSize(const _dim3& dim) {
return size(maskedDims<X_BLOCK, Y_BLOCK, Z_BLOCK>(dim));
}
// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3>
__device__ bool maskedIsZero(const _dim3& idx) {
bool isZero = true;
if (X)
isZero = isZero && idx.x == 0;
if (Y)
isZero = isZero && idx.y == 0;
if (Z)
isZero = isZero && idx.z == 0;
return isZero;
}
// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ bool maskedIsLast(const _dim3& idx, const _dim3_2& dim) {
bool isZero = true;
if (X)
isZero = isZero && idx.x == dim.x - 1;
if (Y)
isZero = isZero && idx.y == dim.y - 1;
if (Z)
isZero = isZero && idx.z == dim.z - 1;
return isZero;
}
} // namespace index_utils
// Default block synchronization. Just use __barrier_sync
namespace block_sync {
__forceinline__ __device__ void init() {}
// Thread-block synchronization
__forceinline__ __device__ void sync() {
__barrier_sync(0);
}
} // namespace block_sync
namespace grid_sync {
// Get the first bit in a 64 bit integer
#define FIRST_UINT64_BIT ((uint64_t)1 << (sizeof(uint64_t) * 8 - 1))
template <typename T>
__device__ T globalAsVolatile(volatile T& global_val) {
return global_val;
}
// A grid synchronization that can be called multiple times in a kernel assuming
// all the blocks fit on device at once. The semaphore is an integer semaphore
// assumed to be initialized to 0 before launching the kernel. The persistent
// option should be envoked if this sync will be called multiple times in one
// kernel (i.e. having a grid reduce within a loop). Having multiple grid syncs
// called once in the same kernel does not require persistent mode. Segment size
// is the number of blocks participating in the sync in the dimensions marked by
// [X,Y,Z]_BLOCK. The granularity of this sync are those dimensions. I.E.
// Marking X and Y but not Z means there should be Z semaphores of size X*Y.
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, bool PERSISTENT>
__device__ void sync(int64_t& semaphore, const uint64_t& segment_size) {
// Finish all global memory transactions before synchronizing
__threadfence();
// Synchronize all threads in a block before synchronizing blocks
block_sync::sync();
// Only allow linear_tid == 0 to participate in the synchronization
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
// Get increment value, only want a single block to have the large
// increment, doesn't really matter which one, the goal is to flip/flop the
// first bit of a uint64_t value, since our semaphores are actualy int64_t
// we will just reinterpret_cast it to act as a uint64_t
uint64_t semaphore_increment = 1;
// Makes the assumption that blocks are in increasing order, this is not
// guaranteed by CUDA but this is the current behavior, and unlikely to
// change.
bool last_block =
index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
if (last_block) {
semaphore_increment = FIRST_UINT64_BIT - (segment_size - 1);
}
uint64_t oldArrive =
atomicAdd(reinterpret_cast<uint64_t*>(&semaphore), semaphore_increment);
// If for persistent kernels, lock all blocks until the semaphore has been
// reached. Make sure we access semaphore as a volatile address so we get
// the global memory updates.
while ((PERSISTENT || last_block) &&
((oldArrive ^ globalAsVolatile(semaphore)) & FIRST_UINT64_BIT) ==
0) {
// Put a sleep here so we have some breaks in probing the global
// semaphore, giving a better chance for other warps/blocks to catch up.
#if __CUDA_ARCH__ >= 700
__nanosleep(200);
#else
// __nanosleep is not available for sm < 70
assert(false);
#endif
}
}
// Sync block to make sure all other threads are waiting on the sync
block_sync::sync();
}
} // namespace grid_sync
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block. If set to false the dimension doesn't
// participate in the reduction. We could start with warp reductions, then
// reduce the warps, this could save some shared memory, but could be slower in
// some instances.
//
// EXAMPLE USAGE:
// blockReduceSum<X_THREADS, Y_THREADS, Z_THREADS>
// (output[output_index], inputs[input_index],
// [] __device__ (T& a, const T b) { a += b; });
//
// Note: We agressively template functions taking dim3 in the functions below
// because ROCM uses different types for the various dim3 and maps them
// directly to intrinsics, but they're dim3 when used after modification.
//
template <
bool X_REDUCE,
bool Y_REDUCE,
bool Z_REDUCE,
typename T,
typename Func,
typename _dim3,
typename _dim3_2>
__device__ void blockReduce(
T& out,
const T& inp_val,
Func reduction_op,
const _dim3& thread_idx,
const _dim3_2& block_dim,
T* shared_mem,
bool read_pred,
bool write_pred,
T init_val) {
// If this thread will output a final result
bool should_write =
index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);
// Size of the reduction segments
unsigned int reduction_size =
index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);
// Index into the reduction segment
unsigned int reduction_tid =
index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
thread_idx, block_dim);
// Index of the reduction segment
unsigned int reduction_idx =
index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
thread_idx, block_dim);
// Offset into smem for the current thread
unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;
// Initialize shared memory
if (read_pred) {
shared_mem[smem_offset] = inp_val;
} else {
shared_mem[smem_offset] = init_val;
}
block_sync::sync();
// Reduce down to nearest power of 2 for the tree reduction:
int np2 = 1 << (31 - __clz(reduction_size));
if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + np2]);
}
block_sync::sync();
// loop peel the final iteration to save one syncthread for the end
for (int factor = np2 / 2; factor > 1; factor >>= 1) {
if (reduction_tid < factor) {
reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + factor]);
}
block_sync::sync();
}
if (should_write && write_pred) {
T result = out;
reduction_op(result, shared_mem[smem_offset]);
if (reduction_size > 1) {
reduction_op(result, shared_mem[smem_offset + 1]);
}
out = result;
}
block_sync::sync();
}
// Use the same pred for both reads and writes
template <
bool X_REDUCE,
bool Y_REDUCE,
bool Z_REDUCE,
typename T,
typename Func,
typename _dim3,
typename _dim3_2>
__device__ void blockReduce(
T& out,
const T& inp_val,
Func reduction_op,
const _dim3& thread_idx,
const _dim3_2& block_dim,
T* shared_mem,
bool read_write_pred,
T init_val) {
blockReduce<X_REDUCE, Y_REDUCE, Z_REDUCE, T, Func, _dim3, _dim3_2>(
out,
inp_val,
reduction_op,
thread_idx,
block_dim,
shared_mem,
read_write_pred,
read_write_pred,
init_val);
}
// Inter-block reduction.
//
// The gridReduce function performs point-wise reductions of scalars across
// thread blocks. Thread blocks are disjointly partitioned into groups,
// "reduction segments", that are collectively defined by boolean template
// parameters, X_BLOCK, Y_BLOCK and Z_BLOCK. Each of X/Y/Z_BLOCK determines
// whether thread blocks along the dimension should be grouped into the same
// reduction segment. Cross-block reducitons are independently done within each
// segment and generates distinctive results per segment. For instance, if all
// of X/Y/Z_BLOCK are true, reductions will be done across all thread blocks
// since there will be just a single segment consisting of all thread blocks. If
// none of them are true, each thread block will become a segment by itself, so
// no reduction will be performed.
//
// The input scalars to reduce within each segment are a certain subset of
// thread-private scalars provided as part of the gridReduce function
// parameters. Boolean template parameters, X_THREAD, Y_THREAD and Z_THREAD,
// determine which subset of the scalars should be used for inter-block
// reductions. Specifically, all the input scalars of threads along each
// dimension will be used when X/Y/Z_THREAD are true. Otherwise, only the value
// held at offset 0 of each dimension will be used. Thus, for example, if all of
// X/Y/Z_THREAD are true, the scalars of all threads in each block will
// participate in inter-block reductions. If all of them are false, only one
// scalar of the thread at threadIdx.x == threadIdx.y == threadIdx.z == 0 will
// be used. In the code below, we call the subset of threads a "reduction
// block". "Participating" thread dimensions here are similar to the
// "non-participating" block dimensions. They come from a block dimension that
// has not been reduced before hitting this grid reduction.
//
// Inter-block reductions perform point-wise reductions of scalars of reduction
// blocks within each reduction segment. More specifically, let rb be a
// reduction block and rs be a reduction segment. Let IN(thread_idx, block_idx)
// denote the input scalar of thread at thread_idx and block_idx. The result of
// each reduction segment, OUT(thread_idx, block_idx_out), is defined only for
// each thread_idx in thread block block_idx_out in the segment as follows:
//
// OUT(thread_idx, block_idx_out) =
// Reduction of IN(thread_idx, block_idx) for
// all block_idx in a reduction segment
//
// OUT is not given for all threads that are not in block_idx_out and the
// reduction block.
//
// See also the function comment of gridReduce.
namespace reduction {
// Reduces all the reduction blocks in each reduction segment. This is the
// "cleanup" stage of a grid reduction.
//
// This is only called by one thread block per reduction segment. The input
// reduction blocks of the segment are stored in an intermediate buffer pointed
// by parameter in. Template parameters X/Y/Z_THREAD denote how the reduction
// block is formed.
//
// The size of a reduction block is by definition smaller or equal to the size
// of a thread block. We use the remaining threads to parallelize reductions
// across reduction blocks. For example, when X/Y/Z_THREAD = {true, false,
// false}, we use blockDim.y*blockDim.z threads for each output value. This is
// done first by loading the input values in parallel and then by reducing
// across threads of dimensions whose XYZ_THREAD are false.
//
// Note that what is done here after the loading from global memory is similar
// to what the existing blockReduce function does.
template <
bool X_THREAD,
bool Y_THREAD,
bool Z_THREAD,
typename T,
typename Func>
__device__ void gridReduceLastBlock(
T& out,
const T* in,
const nvfuser_index_t
grid_reduction_segment_size, // Number of reductions across
// grid reduce dimensions
const nvfuser_index_t
block_reduction_segment_size, // Number of reductions across the block
Func reduction_op,
T* shared_buf,
bool write_pred,
T init_val) {
// We have to do num_reductions across reduction_size. The reductions are
// contiguous, but offset by reduction_size. There is an entry in "in" for
// every block, and every thread marked as true. Threads in dimensions marked
// as false can be used to parallelize the reduction.
// Find the reduction id of the participating threads
const auto block_reduction_segment_idx =
index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
threadIdx, blockDim);
// Find an id associated within a reduction segment for all
// "non-participating" threads, which will parallelize the reductions for the
// "participating" threads
const auto id_in_block_segment =
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
threadIdx, blockDim);
// Stride by the "non-participating" threads
const auto input_stride_for_thread_in_segment =
index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
T inp = init_val;
// Block stride across the reduction until we only have one value per thread
for (nvfuser_index_t reduction_i = id_in_block_segment;
reduction_i < grid_reduction_segment_size;
reduction_i += input_stride_for_thread_in_segment) {
auto work_buf_offset = reduction_i * block_reduction_segment_size +
block_reduction_segment_idx;
reduction_op(inp, in[work_buf_offset]);
}
// Block reduce the per thread values into per "participating" thread values
T inp_tmp = init_val;
blockReduce<!X_THREAD, !Y_THREAD, !Z_THREAD>(
inp_tmp,
inp,
reduction_op,
threadIdx,
blockDim,
shared_buf,
true,
init_val);
const bool should_write = (X_THREAD || threadIdx.x == 0) &&
(Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
if (should_write && write_pred) {
reduction_op(out, inp_tmp);
}
}
// Reduces per-thread values across thread blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - reduction_op: Scalar reduction function
// - work_buf: Temporary buffer for cross-block reductions
// - sync_flags: A vector of integers for synchronizations
// - shared_buf: Shared memory buffer for intra-block reduction
//
// Thread has valid results based on if it's the last block in the grid
// reduction dimension
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, reduces across thread blocks along the X/Y/Z
// dimensions
// - X/Y/Z_THREAD: When true, all threads along the X/Y/Z dimensions participate
// in the cross-block reduction. Otherwise, only threads at offset 0 do.
// These are set to true if the dimension in the block has not been reduced
// previously in producer tensors, and does not participate in the reduction
// (right now they can't), so it's just a "pure" iteration domain as far as
// the grid reduce is concerned.
// - PERSISTENT_REDUCTION: Indicates grid reduction will be called in a loop, or
// the result of the grid reduction will be broadcasted and used across the
// grid. These requires cross grid communication and the grid synchronizations
// here to actually synchronize across the entire grid. When false the grid is
// not synchronized, the last block just waits for everyone else to finish and
// the other blocks can exit early.
// - T: Scalar data type of input/output data
// - Func: Type of scalara reduction function
//
// Template parameters X/Y/Z_BLOCK define a group of thread blocks that are
// reduced together. We call it a reduction segment. Some examples are:
//
// Case 1: X/Y/Z_BLOCK == true/true/true -> There is only one segment, which
// includes all thread blocks. It is effecively the same as the grid.
//
// Case 2: X/Y/Z_BLOCK == false/false/false -> Each thread block comprises an
// individual segment by itself.
//
// Case 3: X/Y/Z_BLOCK == true/false/false -> Each segment contains thread
// blocks that have the same blockDim.x. There will be blockDim.y*blockDim.z
// such segments.
//
// X/Y/Z_THREAD defines a sub region of a thread block that should be reduced
// with the sub regions of other thread blocks. We call it a reduction block.
// E.g.,
//
// Case 1: X/Y/Z_THREAD == false/false/false -> Only thread 0 participates in
// the cross-block reductions. The reduction block is 1x1x1 with thread 0.
//
// Case 2: X/Y/Z_THREAD == true/true/true-> All threads in a thread block
// participate in the cross-block reductions. The reduction block in this case
// is equivalent to the thread block.
//
// After the function completes, only one thread block per reduction segment
// gets valid reduction results. There is no guarantee which particular block
// gets the final results.
//
// Reduces per-thread values across an entire "reduction segment" of the grid
// (see the comment block above for the X/Y/Z_BLOCK / X/Y/Z_THREAD semantics).
//
// Each participating thread stages its input in the global work buffer, all
// blocks in the segment synchronize, and the last block to arrive performs
// the final combine via gridReduceLastBlock.
//
// - out: per-thread output; only written in the last block, and only when
//   write_pred is true
// - inp_val: per-thread input; read only when read_pred is true, otherwise
//   init_val is staged in its place
// - work_buf: global scratch, one slot per (block, participating thread)
// - sync_flags: one semaphore per reduction segment
// - shared_buf: shared-memory scratch for the final block reduction
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    typename T,
    typename Func>
__device__ void gridReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    volatile T* work_buf,
    Tensor<int64_t, 1> sync_flags,
    T* shared_buf,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);
  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  // Only threads at index 0 along each non-participating dimension stage a
  // value: one slot per (block, participating thread).
  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, blockDim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    // Predicated-out threads stage init_val so the final combine can consume
    // every slot unconditionally.
    if (read_pred) {
      work_buf[work_buf_offset] = inp_val;
    } else {
      work_buf[work_buf_offset] = init_val;
    }
  }
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
      sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
  if (last_block) {
    // Cleanup with block reduction
    gridReduceLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
        out,
        (T*)work_buf,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        reduction_op,
        shared_buf,
        write_pred,
        init_val);
  }
  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
        sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  }
}
} // namespace reduction
#undef isize
#undef ioffset
namespace grid_broadcast {
// Broadcasts per-thread values across threads and blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - work_buf: Temporary buffer for communication across threads/blocks
// - sync_flags: A vector of integers for synchronizations
// - read_write_pred: when false, this thread neither stages a value nor
//   reads the broadcast result (it still participates in the syncs)
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, broadcasts across thread blocks along the X/Y/Z
//   dimensions
// - X/Y/Z_THREAD: When true, broadcasts across threads along the X/Y/Z
//   dimensions
//
// Fix over previous revision: removed an unused local (`bool null = false;`)
// that had no effect on behavior.
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    typename T>
__device__ void broadcast(
    T& out,
    const T& inp_val,
    volatile T* work_buf,
    Tensor<int64_t, 1> sync_flags,
    bool read_write_pred) {
  // Number of values broadcasted in the grid dimensions
  const auto grid_seg_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the broadcast we're performing out of the grid_seg_size
  const auto grid_seg_idx =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of threads not participating in a broadcast dimension, this is the
  // number of thread entries to expect in the work buffer, therefore a striding
  const auto block_stride =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
  // Which broadcast in the block this is to line up the entry with the work
  // buffer
  const auto thread_offset =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  // The source of a broadcast is the last block along every broadcast block
  // dimension and thread 0 along every broadcast thread dimension.
  const bool has_valid_data = (!X_BLOCK || blockIdx.x == gridDim.x - 1) &&
      (!Y_BLOCK || blockIdx.y == gridDim.y - 1) &&
      (!Z_BLOCK || blockIdx.z == gridDim.z - 1) &&
      (!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) &&
      (!Z_THREAD || threadIdx.z == 0);
  if (has_valid_data && read_write_pred) {
    work_buf[grid_seg_idx * block_stride + thread_offset] = inp_val;
    // Make the staged value visible to other blocks before signaling.
    __threadfence();
  }
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
      sync_flags[grid_seg_idx], grid_seg_size);
  if (read_write_pred) {
    out = work_buf[grid_seg_idx * block_stride + thread_offset];
  }
  // Make sure everyone has read from the buffer before continuing the kernel
  // and potentially overwriting
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
      sync_flags[grid_seg_idx], grid_seg_size);
}
} // namespace grid_broadcast
namespace broadcast {
// Intra-block broadcast within partitioned groups of threads.
//
// X/Y/Z_THREAD: when true, the value is broadcast along that thread
// dimension, sourced from the thread with index 0 in that dimension.
// inp_val: per-thread source value; only meaningful on source threads.
// out: per-thread output location.
// read_write_pred: threads with a false predicate neither publish nor
// consume a value, but still take part in the block syncs.
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T>
__device__ void blockBroadcast(
    T& out,
    const T& inp_val,
    T* shared_mem,
    bool read_write_pred) {
  // A thread sources the broadcast iff it sits at index 0 along every
  // broadcast dimension.
  const bool is_src = (X_THREAD ? threadIdx.x == 0 : true) &&
      (Y_THREAD ? threadIdx.y == 0 : true) &&
      (Z_THREAD ? threadIdx.z == 0 : true);
  // One shared-memory slot per group of threads being broadcast to.
  const auto smem_idx =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  if (is_src && read_write_pred) {
    shared_mem[smem_idx] = inp_val;
  }
  block_sync::sync();
  if (read_write_pred) {
    out = shared_mem[smem_idx];
  }
  // Keep the slot from being overwritten until everyone has read it.
  block_sync::sync();
}
} // namespace broadcast
// -----------------------------------------------------------------------------------------------
// Block Welford Primitives
// -----------------------------------------------------------------------------------------------
// Merge one Welford partial result (b) into another (a), in place. Usable
// both for scanning in a single new value and for combining two partial
// results (the parallel variance update).
template <typename T, typename TN>
__inline__ __device__ void welfordCombine(
    T& a_avg,
    T& a_M2,
    TN& a_N,
    const T& b_avg,
    const T& b_M2,
    TN b_N) {
  // Merging an empty partial result is a no-op.
  if (b_N == 0) {
    return;
  }
  TN total_N = a_N + b_N;
  T weight = ((T)(nvfuser_index_t)(b_N)) / ((T)(nvfuser_index_t)(total_N));
  T mean_diff = b_avg - a_avg;
  a_avg += mean_diff * weight;
  // The cross term uses the pre-merge a_N, so a_N is updated last.
  a_M2 += b_M2 + mean_diff * mean_diff * ((T)(nvfuser_index_t)(a_N)) * weight;
  a_N = total_N;
}
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block.
//
// Intra-block Welford reduction over the thread dimensions selected by
// X/Y/Z_REDUCE. Each thread contributes (in_avg, in_M2, in_N); on exit the
// thread at index 0 of each reduction segment (and passing write_pred) has
// the merged result folded into (out_avg, out_M2, out_N).
//
// shared_mem_avg/M2/N must each provide one slot per thread in the block.
// Threads failing read_pred contribute (init_val, init_val, 0); N == 0
// makes welfordCombine treat the entry as empty.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename TN,
    typename _dim3,
    typename _dim3_2>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);
  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);
  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          thread_idx, block_dim);
  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          thread_idx, block_dim);
  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;
  if (read_pred) {
    shared_mem_avg[smem_offset] = in_avg;
    shared_mem_M2[smem_offset] = in_M2;
    shared_mem_N[smem_offset] = in_N;
  } else {
    // N == 0 makes this entry a no-op for welfordCombine.
    shared_mem_avg[smem_offset] = init_val;
    shared_mem_M2[smem_offset] = init_val;
    shared_mem_N[smem_offset] = 0;
  }
  block_sync::sync();
  // Reduce down to nearest power of 2:
  int np2 = 1 << (31 - __clz(reduction_size));
  // First fold the tail [np2, reduction_size) into [0, reduction_size - np2)
  // so the remaining tree is a clean power of two.
  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    welfordCombine(
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset],
        shared_mem_avg[smem_offset + np2],
        shared_mem_M2[smem_offset + np2],
        shared_mem_N[smem_offset + np2]);
  }
  block_sync::sync();
  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      welfordCombine(
          shared_mem_avg[smem_offset],
          shared_mem_M2[smem_offset],
          shared_mem_N[smem_offset],
          shared_mem_avg[smem_offset + factor],
          shared_mem_M2[smem_offset + factor],
          shared_mem_N[smem_offset + factor]);
    }
    block_sync::sync();
  }
  if (should_write && write_pred) {
    // Fold the block result into the caller's running (out_*) values.
    T res_avg = out_avg;
    T res_M2 = out_M2;
    TN res_N = out_N;
    welfordCombine(
        res_avg,
        res_M2,
        res_N,
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset]);
    // Peeled final tree step: merge the remaining second element.
    if (reduction_size > 1) {
      welfordCombine(
          res_avg,
          res_M2,
          res_N,
          shared_mem_avg[smem_offset + 1],
          shared_mem_M2[smem_offset + 1],
          shared_mem_N[smem_offset + 1]);
    }
    out_avg = res_avg;
    out_M2 = res_M2;
    out_N = res_N;
  }
  block_sync::sync();
}
// Use the same pred for both reads and writes
// Convenience overload: forwards to the two-predicate blockWelford with
// read_write_pred supplied as both the read and the write predicate.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename TN,
    typename _dim3,
    typename _dim3_2>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_write_pred,
    T init_val) {
  blockWelford<X_REDUCE, Y_REDUCE, Z_REDUCE, T, TN, _dim3, _dim3_2>(
      out_avg,
      out_M2,
      out_N,
      in_avg,
      in_M2,
      in_N,
      thread_idx,
      block_dim,
      shared_mem_avg,
      shared_mem_M2,
      shared_mem_N,
      read_write_pred,
      read_write_pred,
      init_val);
}
// -----------------------------------------------------------------------------------------------
// Grid Welford Prototype
// -----------------------------------------------------------------------------------------------
namespace welford {
// Final-block step of gridWelford: merges the per-block Welford partials
// staged in in_avg/in_M2/in_N (global memory) into out_avg/out_M2/out_N
// for the "participating" threads of the last block.
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T, typename TN>
__device__ void gridWelfordLastBlock(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T* in_avg,
    const T* in_M2,
    const TN* in_N,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.
  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, blockDim);
  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
  T inp_avg = init_val;
  T inp_M2 = init_val;
  TN inp_N = 0;
  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    welfordCombine(
        inp_avg,
        inp_M2,
        inp_N,
        in_avg[work_buf_offset],
        in_M2[work_buf_offset],
        in_N[work_buf_offset]);
  }
  // Block reduce the per thread values into per "participating" thread values
  T inp_avg_tmp = init_val;
  T inp_M2_tmp = init_val;
  TN inp_N_tmp = 0;
  blockWelford<!X_THREAD, !Y_THREAD, !Z_THREAD>(
      inp_avg_tmp,
      inp_M2_tmp,
      inp_N_tmp,
      inp_avg,
      inp_M2,
      inp_N,
      threadIdx,
      blockDim,
      shared_buf_avg,
      shared_buf_M2,
      shared_buf_N,
      true,
      init_val);
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    welfordCombine(out_avg, out_M2, out_N, inp_avg_tmp, inp_M2_tmp, inp_N_tmp);
  }
}
// Grid welford combine
// Welford analogue of reduction::gridReduce: each participating thread
// stages (avg, M2, N) in the global work buffers, the segment synchronizes,
// and the last block to arrive finishes via gridWelfordLastBlock. With
// PERSISTENT_REDUCTION a second sync makes the buffers safe to reuse.
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    typename T,
    typename TN>
__device__ void gridWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& inp_avg,
    const T& inp_M2,
    const TN& inp_N,
    volatile T* work_buf_avg,
    volatile T* work_buf_M2,
    volatile TN* work_buf_N,
    Tensor<int64_t, 1> sync_flags,
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);
  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf_avg += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  work_buf_M2 += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  work_buf_N += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, blockDim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    if (read_pred) {
      work_buf_avg[work_buf_offset] = inp_avg;
      work_buf_M2[work_buf_offset] = inp_M2;
      work_buf_N[work_buf_offset] = inp_N;
    } else {
      // N == 0 marks the entry as empty for welfordCombine.
      work_buf_avg[work_buf_offset] = init_val;
      work_buf_M2[work_buf_offset] = init_val;
      work_buf_N[work_buf_offset] = 0;
    }
  }
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
      sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
  if (last_block) {
    // final reduction
    gridWelfordLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
        out_avg,
        out_M2,
        out_N,
        (T*)work_buf_avg,
        (T*)work_buf_M2,
        (TN*)work_buf_N,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        shared_buf_avg,
        shared_buf_M2,
        shared_buf_N,
        write_pred,
        init_val);
  }
  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
        sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  }
}
} // namespace welford
#undef isize
#undef ioffset
namespace warp {
// Reduction along threadIdx.x implemented with warp shuffles.
//
// Each warp reduces via __shfl_xor_sync; warp leaders stage their partials
// in shared memory and warp 0 of each (y, z) row combines them, merging the
// result into `out` on the row's lane-0 thread. When SINGLE_WARP is true the
// shared-memory round trip is skipped entirely.
// NOTE(review): appears to assume block_dim.x is a multiple of the warp
// size (see "padded to multiples of a warp" below) — confirm with callers.
template <
    bool SINGLE_WARP,
    typename T,
    typename Func,
    typename _dim3ti,
    typename _dim3bd>
__device__ void warpReduceTIDX(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3ti& thread_idx,
    const _dim3bd& block_dim,
    T* shared_mem,
    bool read_write_pred,
    T init_val) {
  constexpr int WARP_SIZE = 32;
  // Assume input padded to multiples of a warp
  T reduce_val = init_val;
  // Do warp reduction
  if (read_write_pred) {
    reduce_val = inp_val;
  }
  // Reduce within each warp
  for (int i = 16; i >= 1; i /= 2) {
    reduction_op(
        reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, WARP_SIZE));
  }
  // Reduce across warp if needed
  // Load value to shared mem
  if (!SINGLE_WARP) {
    unsigned int warp_idx = thread_idx.x / WARP_SIZE;
    unsigned int lane_idx = thread_idx.x % WARP_SIZE;
    // Each (z, y) coordinate is an independent reduction group.
    unsigned int reduce_group_id = thread_idx.z * block_dim.y + thread_idx.y;
    bool is_warp_head = lane_idx == 0;
    unsigned int reduction_size = block_dim.x;
    unsigned int num_of_warps = reduction_size / WARP_SIZE;
    unsigned int smem_offset = reduce_group_id * num_of_warps;
    block_sync::sync();
    if (read_write_pred && is_warp_head) {
      shared_mem[smem_offset + warp_idx] = reduce_val;
    }
    block_sync::sync();
    if (warp_idx == 0) {
      // This assumes num_of_warps will be < 32, meaning < 1024 blocks.
      // Should be true for long enough.
      assert(num_of_warps <= 32);
      // Lanes beyond num_of_warps contribute the identity value.
      reduce_val = lane_idx < num_of_warps ? shared_mem[smem_offset + lane_idx]
                                           : init_val;
      // Reduce within warp 0
      for (int i = 16; i >= 1; i /= 2) {
        reduction_op(
            reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, 32));
      }
    }
    if (is_warp_head) {
      reduction_op(out, reduce_val);
    }
  } else {
    reduction_op(out, reduce_val);
  }
}
} // namespace warp
// No "#pragma once" because this is a raw definition that can be copied by jit codegen.
// Eager mode clients should not include this file directly, instead,
// they should #include <ATen/cuda/CUDAGeneratorImpl.h>, which has a #pragma once.
// Stores RNG state values. Passed as a kernel argument.
// See Note [CUDA Graph-safe RNG states].
//
// The raw definition lives in its own file so jit codegen can easily copy it.
namespace at {
// RNG state values handed to a kernel as an argument.
// See Note [CUDA Graph-safe RNG states].
struct PhiloxCudaState {
  PhiloxCudaState() = default;
  // Non-capturing path: the offset is carried as a plain value.
  PhiloxCudaState(uint64_t seed, uint64_t offset) : seed_(seed) {
    offset_.val = offset;
  }
  // Graph-capture path: the offset lives outside the graph and is read
  // through a pointer at replay time.
  PhiloxCudaState(
      uint64_t seed,
      int64_t* offset_extragraph,
      uint32_t offset_intragraph)
      : seed_(seed),
        offset_intragraph_(offset_intragraph),
        captured_(true) {
    offset_.ptr = offset_extragraph;
  }
  // Public members, directly accessible by at::cuda::philox::unpack.
  // If we made them private with getters/setters, the getters/setters
  // would have to be __device__, and we can't declare __device__ in ATen.
  union Payload {
    uint64_t val;
    int64_t* ptr;
  };
  uint64_t seed_ = 0;
  Payload offset_;
  uint32_t offset_intragraph_ = 0;
  bool captured_ = false;
};
} // namespace at
// Generated fused kernel: two elementwise where() selects over 0-dim
// tensors sharing the predicate T0.
// NOTE(review): T1/T2 are Tensor<int, 0>, but the runtime only declares
// where() overloads for double/float/int64_t operands (per the NVRTC error
// reported for this program in the log), so the first call fails overload
// resolution and the program does not compile.
__global__ void kernel133(Tensor<bool, 0> T0, Tensor<int, 0> T1, Tensor<int, 0> T2, Tensor<int64_t, 0> T3, Tensor<int64_t, 0> T4, Tensor<int, 0> T6, Tensor<int64_t, 0> T5) {
  T6[0]
      = where(T0[0], T1[0], T2[0]);
  T5[0]
      = where(T0[0], T3[0], T4[0]);
}
}
CUDA NVRTC compile error: default_program(1670): error: more than one instance of overloaded function "CudaCodeGen::where" matches the argument list:
function "CudaCodeGen::where(__nv_bool, double, double)"
function "CudaCodeGen::where(__nv_bool, float, float)"
function "CudaCodeGen::where(__nv_bool, CudaCodeGen::int64_t, CudaCodeGen::int64_t)"
argument types are: (__nv_bool, int, int)
1 error detected in the compilation of "default_program".
======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Error adding cache_after T2_g[ iS4{i13}, iS5{i16}, sbS6{1}, iS7{i22} ] we restrict using cache_after on an output.
======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Expected kernel_ to be true, but got false. (Could this error message be improved? If so, please report an enhancement request to PyTorch.)
======================================================================
ERROR: test_nvfuser_correctness__masked_mean_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Expected kernel_ to be true, but got false. (Could this error message be improved? If so, please report an enhancement request to PyTorch.)
======================================================================
ERROR: test_nvfuser_correctness__masked_prod_cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: falseINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/executor_utils.cpp":828, please report a bug to PyTorch. namespace CudaCodeGen {
// Fixed-width integer typedefs for this compilation unit (mirroring
// <cstdint>, which is presumably unavailable under NVRTC).
typedef unsigned char uint8_t;
typedef signed char int8_t;
typedef short int int16_t;
typedef int int32_t;
typedef unsigned int uint32_t;
typedef long long int int64_t;
typedef unsigned long long int uint64_t;
// Index type used for tensor sizes/strides/indexing in generated kernels.
typedef int nvfuser_index_t;
// Reinterpret a __half's storage as (const) unsigned short for inline PTX.
#define __NVFUSER_HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_HALF_TO_CUS(var) \
  *(reinterpret_cast<const unsigned short*>(&(var)))
// Minimal IEEE fp16 type carrying only the raw bit pattern and the
// conversions the generated kernels need.
struct __half;
__device__ __half __float2half(const float);
struct __align__(2) __half {
  __half() = default;
  // Construct from float (round-to-nearest via cvt.rn in __float2half).
  __device__ __half(const float f) {
    __x = __float2half(f).__x;
  }
 protected:
  unsigned short __x; // raw fp16 bit pattern
};
// float -> fp16 conversion; cvt.rn.f16.f32 rounds to nearest.
__device__ __half __float2half(const float f) {
  __half val;
  asm("{ cvt.rn.f16.f32 %0, %1;}\n"
      : "=h"(__NVFUSER_HALF_TO_US(val))
      : "f"(f));
  return val;
}
// fp16 -> float conversion (exact; every fp16 value is representable).
__device__ float __half2float(const __half h) {
  float val;
  asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__NVFUSER_HALF_TO_CUS(h)));
  return val;
}
// Aligned fixed-size vector; the alignas(sizeof(scalar_t) * vec_size)
// allows the compiler to emit vectorized loads/stores on CUDA.
template <typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) Array {
  scalar_t val[vec_size];
  // Fill every lane with the same value.
  __device__ void set(scalar_t v) {
    for (auto& lane : val) {
      lane = v;
    }
  }
};
// Reinterpret a __bfloat's storage as (const) unsigned short for inline PTX.
#define __NVFUSER_BFLOAT_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_BFLOAT_TO_CUS(var) \
  *(reinterpret_cast<const unsigned short*>(&(var)))
// Minimal bfloat16 type carrying only the raw bit pattern and the
// conversions the generated kernels need.
struct __bfloat;
__device__ __bfloat __float2bfloat(const float);
struct __align__(2) __bfloat {
  __bfloat() = default;
  // Construct from float (round-to-nearest via cvt.rn in __float2bfloat).
  __device__ __bfloat(const float f) {
    __x = __float2bfloat(f).__x;
  }
 protected:
  unsigned short __x; // raw bf16 bit pattern
};
// float -> bf16 conversion; cvt.rn.bf16.f32 rounds to nearest.
__device__ __bfloat __float2bfloat(const float f) {
  __bfloat val;
  asm("{ cvt.rn.bf16.f32 %0, %1;}\n"
      : "=h"(__NVFUSER_BFLOAT_TO_US(val))
      : "f"(f));
  return val;
}
// bf16 -> float: place the 16 bf16 bits in the high half of a 32-bit word
// (low half zero), since bf16 is the upper 16 bits of an fp32.
__device__ float __bfloat2float(const __bfloat h) {
  float val;
  asm("{ mov.b32 %0, {0,%1};}\n"
      : "=f"(val)
      : "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
  return val;
}
// Tensor argument passed to generated kernels: a raw data pointer plus
// per-dimension size and stride. operator[] takes a precomputed flat
// element offset, not a multi-dimensional index.
template <typename T, int N>
struct Tensor {
  __device__ T& operator[](nvfuser_index_t ind) {
    return data[ind];
  };
  T* data;
  nvfuser_index_t size[N];
  nvfuser_index_t stride[N];
};
// Specialization for 0-dim case as it does not need size and stride arrays.
// They will be an error as well since zero-length arrays are not allowed.
template <typename T>
struct Tensor<T, 0> {
  // The index is ignored; a 0-dim tensor has exactly one element.
  __device__ T& operator[](nvfuser_index_t) {
    return *data;
  };
  T* data;
};
// Philox counter-based RNG (128-bit counter, 64-bit key). Each operator()
// call hands out one 32-bit draw; four draws are produced per counter value.
class Philox {
 public:
  // seed selects the key; (subsequence, offset) position the counter so
  // independent consumers draw from disjoint streams.
  __device__ Philox(
      unsigned long long seed,
      unsigned long long subsequence,
      unsigned long long offset) {
    key.x = (unsigned int)seed;
    key.y = (unsigned int)(seed >> 32);
    counter = make_uint4(0, 0, 0, 0);
    counter.z = (unsigned int)(subsequence);
    counter.w = (unsigned int)(subsequence >> 32);
    STATE = 0;
    // Each counter value yields 4 outputs, hence offset / 4.
    incr_n(offset / 4);
  }
  // Returns the next 32-bit random value.
  __device__ unsigned long operator()() {
    if (STATE == 0) {
      uint4 counter_ = counter;
      uint2 key_ = key;
      // 9 rounds here plus the final single_round below: 10 rounds total,
      // bumping the key between rounds.
      for (int i = 0; i < 9; i++) {
        counter_ = single_round(counter_, key_);
        key_.x += (kPhilox10A);
        key_.y += (kPhilox10B);
      }
      output = single_round(counter_, key_);
      incr();
    }
    // Hand out the four 32-bit lanes of `output` one call at a time.
    unsigned long ret = 0;
    switch (STATE) {
      case 0:
        ret = output.x;
        break;
      case 1:
        ret = output.y;
        break;
      case 2:
        ret = output.z;
        break;
      case 3:
        ret = output.w;
        break;
    }
    STATE = (STATE + 1) % 4;
    return ret;
  }
 private:
  // Advance the 128-bit counter by n, propagating carries upward.
  __device__ void incr_n(unsigned long long n) {
    unsigned int nlo = (unsigned int)(n);
    unsigned int nhi = (unsigned int)(n >> 32);
    counter.x += nlo;
    // counter.x wrapped -> carry into the next word.
    if (counter.x < nlo)
      nhi++;
    counter.y += nhi;
    if (nhi <= counter.y)
      return;
    if (++counter.z)
      return;
    ++counter.w;
  }
  // Advance the 128-bit counter by one, propagating carries upward.
  __device__ void incr() {
    if (++counter.x)
      return;
    if (++counter.y)
      return;
    if (++counter.z)
      return;
    ++counter.w;
  }
  // 32x32 multiply: returns the low 32 bits; high 32 via *result_high.
  __device__ unsigned int mulhilo32(
      unsigned int a,
      unsigned int b,
      unsigned int* result_high) {
    *result_high = __umulhi(a, b);
    return a * b;
  }
  // One Philox round: two 32x32 multiplies, halves XORed with the key.
  __device__ uint4 single_round(uint4 ctr, uint2 key) {
    unsigned int hi0;
    unsigned int hi1;
    unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
    unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
    uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
    return ret;
  }
 private:
  // Philox round-key increments (10A/10B) and multipliers (SA/SB).
  static constexpr unsigned long kPhilox10A = 0x9E3779B9;
  static constexpr unsigned long kPhilox10B = 0xBB67AE85;
  static constexpr unsigned long kPhiloxSA = 0xD2511F53;
  static constexpr unsigned long kPhiloxSB = 0xCD9E8D57;
  uint4 counter = {};
  uint4 output = {};
  uint2 key = {};
  unsigned int STATE = 0; // which lane of `output` the next call returns
};
// Map a 32-bit random integer to a float uniformly distributed in [0, 1).
__device__ float uniformf(unsigned int x) {
  // 2^-32 as a float: rescales the full 32-bit range into [0, 1).
  constexpr float kInv2Pow32 = 2.3283064e-10f;
  return kInv2Pow32 * x;
}
// Combine two 32-bit random integers into a double uniform on (0, 1):
// 53 random bits scaled by 2^-53, then offset to the bin center.
__device__ double uniform(unsigned int x, unsigned int y) {
  // 2^-53: one step of the 53-bit grid in [0, 1).
  constexpr double kInv2Pow53 = 1.1102230246251565e-16;
  const unsigned long long lo = (unsigned long long)x;
  const unsigned long long hi = (unsigned long long)y << (53 - 32);
  return (lo ^ hi) * kInv2Pow53 + kInv2Pow53 / 2.0;
}
// Defines `nvfuser_zero`, an int that is always 0 but that the compiler
// cannot prove is 0 (it is materialized through shared memory and an
// atomicMin), used to keep index arithmetic from being over-optimized.
#define NVFUSER_DEFINE_MAGIC_ZERO          \
  __shared__ int nvfuser_zero_s;           \
  if (threadIdx.x == 0)                    \
    nvfuser_zero_s = 0;                    \
  __syncthreads();                         \
  atomicMin(&nvfuser_zero_s, threadIdx.x); \
  int nvfuser_zero = nvfuser_zero_s;
// Re-obfuscates nvfuser_zero each loop iteration (0 << 1 is still 0, but the
// compiler must keep the variable live).
#define NVFUSER_UPDATE_MAGIC_ZERO \
  do {                            \
    nvfuser_zero <<= 1;           \
  } while (0);
// Integer ceiling division: smallest integer >= a / b (for positive b).
__device__ constexpr int ceilDiv(int a, int b) {
  return (a + (b - 1)) / b;
}
__device__ constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + (b - 1)) / b;
}
// Mixed-width overloads promote the narrower operand and delegate.
__device__ constexpr int64_t ceilDiv(int64_t a, int b) {
  return ceilDiv(a, static_cast<int64_t>(b));
}
__device__ constexpr int64_t ceilDiv(int a, int64_t b) {
  return ceilDiv(static_cast<int64_t>(a), b);
}
// Integer max overloads; mixed-width versions promote to 64-bit and
// delegate to the homogeneous overload.
__device__ constexpr int max(int a, int b) {
  return a < b ? b : a;
}
__device__ constexpr int64_t max(int64_t a, int b) {
  return max(a, static_cast<int64_t>(b));
}
__device__ constexpr int64_t max(int a, int64_t b) {
  return max(static_cast<int64_t>(a), b);
}
__device__ constexpr int64_t max(int64_t a, int64_t b) {
  return a < b ? b : a;
}
// NaN-propagating max: unlike ::fmax (which prefers the non-NaN operand),
// these return NaN whenever either input is NaN.
__device__ double fmax(double a, double b) {
  // a != a is true exactly when a is NaN.
  return a != a ? a : (b != b ? b : ::fmax(a, b));
}
__device__ float fmax(float a, float b) {
  return a != a ? a : (b != b ? b : ::fmax(a, b));
}
// Integer min overloads; mixed-width versions promote to 64-bit and
// delegate to the homogeneous overload.
__device__ constexpr int min(int a, int b) {
  return b < a ? b : a;
}
__device__ constexpr int64_t min(int64_t a, int b) {
  return min(a, static_cast<int64_t>(b));
}
__device__ constexpr int64_t min(int a, int64_t b) {
  return min(static_cast<int64_t>(a), b);
}
__device__ constexpr int64_t min(int64_t a, int64_t b) {
  return b < a ? b : a;
}
// NaN-propagating min: unlike ::fmin (which prefers the non-NaN operand),
// these return NaN whenever either input is NaN.
__device__ double fmin(double a, double b) {
  // a != a is true exactly when a is NaN.
  return a != a ? a : (b != b ? b : ::fmin(a, b));
}
__device__ float fmin(float a, float b) {
  return a != a ? a : (b != b ? b : ::fmin(a, b));
}
// Round `buffer` up to the next multiple of `size`.
// NOTE(review): the bit trick assumes size is a power of two -- confirm
// callers only pass power-of-two alignments.
__device__ constexpr int alignBufferSize(int buffer, int size) {
  const int mask = size - 1;
  return (buffer + mask) & ~mask;
}
// Clamp x into [minv, maxv]. A NaN x falls through unchanged since both
// comparisons evaluate false for NaN.
__device__ double clamp(double x, double minv, double maxv) {
  if (x < minv) {
    return minv;
  }
  if (x > maxv) {
    return maxv;
  }
  return x;
}
__device__ float clamp(float x, double minv, double maxv) {
  if (x < minv) {
    return minv;
  }
  if (x > maxv) {
    return maxv;
  }
  return x;
}
// Fractional part, keeping the sign of x (e.g. frac(-1.25) == -0.25).
__device__ double frac(double x) {
  const double whole = trunc(x);
  return x - whole;
}
__device__ float frac(float x) {
  const float whole = trunc(x);
  return x - whole;
}
// Exact (erf-based) GELU: x * Phi(x), where Phi is the standard normal CDF.
__device__ double gelu(double x) {
  const auto cdf = normcdf(x);
  return x * cdf;
}
__device__ float gelu(float x) {
  const auto cdf = normcdf(x);
  return x * cdf;
}
// Multiplicative inverse; division by zero follows IEEE rules (yields inf).
__device__ double reciprocal(double x) {
  return 1.0 / x;
}
__device__ float reciprocal(float x) {
  return 1.0f / x;
}
// ReLU: zero for non-positive inputs, identity otherwise. NaN passes
// through because (NaN <= 0) is false.
__device__ double relu(double x) {
  if (x <= 0) {
    return 0;
  }
  return x;
}
__device__ float relu(float x) {
  if (x <= 0) {
    return 0;
  }
  return x;
}
// Integer overloads return float -- NOTE(review): values above 2^24 lose
// precision in the int64_t -> float conversion; confirm acceptable.
__device__ float relu(int64_t x) {
  if (x <= 0) {
    return 0;
  }
  return x;
}
__device__ float relu(int x) {
  if (x <= 0) {
    return 0;
  }
  return x;
}
// Python-style floating remainder: the result takes the sign of b, unlike
// ::fmod which takes the sign of a.
__device__ double remainder(double a, double b) {
  auto r = ::fmod(a, b);
  const bool sign_mismatch = (b < 0) != (r < 0);
  if (r != 0 && sign_mismatch) {
    r += b;
  }
  return r;
}
__device__ float remainder(float a, float b) {
  auto r = ::fmod(a, b);
  const bool sign_mismatch = (b < 0) != (r < 0);
  if (r != 0 && sign_mismatch) {
    r += b;
  }
  return r;
}
// Logistic sigmoid: 1 / (1 + e^-x).
__device__ double sigmoid(double x) {
  const auto e = exp(-x);
  return 1 / (1 + e);
}
__device__ float sigmoid(float x) {
  const auto e = exp(-x);
  return 1 / (1 + e);
}
// SiLU (a.k.a. Swish): x * sigmoid(x).
__device__ double silu(double x) {
  const auto s = sigmoid(x);
  return x * s;
}
__device__ float silu(float x) {
  const auto s = sigmoid(x);
  return x * s;
}
// Replace values at or below threshold t with v; pass others through.
// NaN x passes through since (NaN <= t) is false.
__device__ double threshold(double x, double t, double v) {
  if (x <= t) {
    return v;
  }
  return x;
}
__device__ float threshold(float x, double t, double v) {
  if (x <= t) {
    return v;
  }
  return x;
}
// Elementwise select: a when the condition holds, b otherwise.
__device__ double where(bool c, double a, double b) {
  if (c) {
    return a;
  }
  return b;
}
__device__ float where(bool c, float a, float b) {
  if (c) {
    return a;
  }
  return b;
}
__device__ int64_t where(bool c, int64_t a, int64_t b) {
  if (c) {
    return a;
  }
  return b;
}
// Draw a double in [0, 1) using two 32-bit Philox outputs.
// NOTE(review): the evaluation order of the two rnd() calls is unspecified
// in C++, so which draw supplies the low vs. high bits is
// compiler-dependent -- confirm this is acceptable for reproducibility.
__device__ double randLike(Philox& rnd) {
  return uniform(rnd(), rnd());
}
// Draw a float in [0, 1) using one 32-bit Philox output.
__device__ float randLikef(Philox& rnd) {
  return uniformf(rnd());
}
// Python-style integer remainder: the result takes the sign of b, unlike
// the builtin % which takes the sign of a.
__device__ constexpr int64_t remainder(int64_t a, int64_t b) {
  auto r = a % b;
  if (r != 0 && (b < 0) != (r < 0)) {
    r += b;
  }
  return r;
}
__device__ constexpr int remainder(int a, int b) {
  auto r = a % b;
  if (r != 0 && (b < 0) != (r < 0)) {
    r += b;
  }
  return r;
}
// fmod overloads: truncation-style remainder (sign of a), matching C fmod.
// Floating-point versions defer to the math library; integer versions use
// the builtin %, which already truncates toward zero.
__device__ constexpr double fmod(double a, double b) {
  return ::fmod(a, b);
}
__device__ constexpr float fmod(float a, float b) {
  return ::fmod(a, b);
}
__device__ constexpr int64_t fmod(int64_t a, int64_t b) {
  return a % b;
}
__device__ constexpr int fmod(int a, int b) {
  return a % b;
}
// Integer power by binary exponentiation (O(log b) multiplies).
// Negative exponents truncate toward zero: only |base| == 1 can yield a
// nonzero result ((-1)^b alternates sign with the parity of b).
template <typename T>
__device__ T pow(T a, T b) {
  if (b < 0) {
    if (a == 1) {
      return 1;
    }
    if (a == -1) {
      return ((-b) % static_cast<T>(2)) ? -1 : 1;
    }
    return 0;
  }
  T acc = 1;
  while (b) {
    if (b & 1) {
      acc *= a;
    }
    b /= 2;
    a *= a;
  }
  return acc;
}
// Explicit instantiations for the integer types used by generated kernels.
template int pow<int>(int a, int b);
template int64_t pow<int64_t>(int64_t a, int64_t b);
// Floating-point specializations defer to the CUDA math library.
// NOTE(review): these specializations lack the __device__ qualifier that the
// primary template carries -- confirm that is intended / accepted by nvcc.
template <>
float pow<float>(float a, float b) {
  return ::pow(a, b);
}
template <>
double pow<double>(double a, double b) {
  return ::pow(a, b);
}
namespace index_utils {
// Utility functions
// Total size of provided dimension (product of x, y, z extents).
template <typename _dim3>
__device__ __forceinline__ nvfuser_index_t size(const _dim3& d) {
  return (nvfuser_index_t)d.x * (nvfuser_index_t)d.y * (nvfuser_index_t)d.z;
}
// Linearized indexing of idx based on dim, if bool==false that dimension does
// not participate
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t maskedOffset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = 0;
  // z is the slowest-varying dimension, x the fastest.
  if (Z)
    offset += idx.z;
  if (Y)
    offset = offset * dim.y + idx.y;
  if (X)
    offset = offset * dim.x + idx.x;
  return offset;
}
// Linearized indexing of idx based on dim. All dimensions participate.
template <typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t offset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = idx.z;
  offset = offset * dim.y + idx.y;
  offset = offset * dim.x + idx.x;
  return offset;
}
// Masks the provided dim3, those == false get truncated to 1
template <bool X, bool Y, bool Z, typename _dim3>
__device__ dim3 maskedDims(const _dim3& dim) {
  return dim3{
      X ? (unsigned)dim.x : 1U,
      Y ? (unsigned)dim.y : 1U,
      Z ? (unsigned)dim.z : 1U};
}
// Provides total size of dim with masking, those dims == false do not
// participate in the size calculation
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
__device__ nvfuser_index_t maskedSize(const _dim3& dim) {
  return size(maskedDims<X_BLOCK, Y_BLOCK, Z_BLOCK>(dim));
}
// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3>
__device__ bool maskedIsZero(const _dim3& idx) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == 0;
  if (Y)
    isZero = isZero && idx.y == 0;
  if (Z)
    isZero = isZero && idx.z == 0;
  return isZero;
}
// Checks if provided idx is the last entry (dim - 1) on those dims == true
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ bool maskedIsLast(const _dim3& idx, const _dim3_2& dim) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == dim.x - 1;
  if (Y)
    isZero = isZero && idx.y == dim.y - 1;
  if (Z)
    isZero = isZero && idx.z == dim.z - 1;
  return isZero;
}
} // namespace index_utils
// Default block synchronization. Just use __barrier_sync
namespace block_sync {
// No-op: the default barrier needs no initialization.
__forceinline__ __device__ void init() {}
// Thread-block synchronization
__forceinline__ __device__ void sync() {
  __barrier_sync(0);
}
} // namespace block_sync
namespace grid_sync {
// Get the first bit in a 64 bit integer
#define FIRST_UINT64_BIT ((uint64_t)1 << (sizeof(uint64_t) * 8 - 1))
// Read a global value through a volatile reference so the load cannot be
// cached or optimized away.
template <typename T>
__device__ T globalAsVolatile(volatile T& global_val) {
  return global_val;
}
// A grid synchronization that can be called multiple times in a kernel assuming
// all the blocks fit on device at once. The semaphore is an integer semaphore
// assumed to be initialized to 0 before launching the kernel. The persistent
// option should be invoked if this sync will be called multiple times in one
// kernel (i.e. having a grid reduce within a loop). Having multiple grid syncs
// called once in the same kernel does not require persistent mode. Segment size
// is the number of blocks participating in the sync in the dimensions marked by
// [X,Y,Z]_BLOCK. The granularity of this sync are those dimensions. I.E.
// Marking X and Y but not Z means there should be Z semaphores of size X*Y.
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, bool PERSISTENT>
__device__ void sync(int64_t& semaphore, const uint64_t& segment_size) {
  // Finish all global memory transactions before synchronizing
  __threadfence();
  // Synchronize all threads in a block before synchronizing blocks
  block_sync::sync();
  // Only allow linear_tid == 0 to participate in the synchronization
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    // Get increment value, only want a single block to have the large
    // increment, doesn't really matter which one, the goal is to flip/flop the
    // first bit of a uint64_t value, since our semaphores are actually int64_t
    // we will just reinterpret_cast it to act as a uint64_t
    uint64_t semaphore_increment = 1;
    // Makes the assumption that blocks are in increasing order, this is not
    // guaranteed by CUDA but this is the current behavior, and unlikely to
    // change.
    bool last_block =
        index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    if (last_block) {
      // The last arrival's increment makes the running sum reach exactly
      // FIRST_UINT64_BIT once all segment_size blocks have added 1,
      // flipping the semaphore's top bit.
      semaphore_increment = FIRST_UINT64_BIT - (segment_size - 1);
    }
    uint64_t oldArrive =
        atomicAdd(reinterpret_cast<uint64_t*>(&semaphore), semaphore_increment);
    // If for persistent kernels, lock all blocks until the semaphore has been
    // reached. Make sure we access semaphore as a volatile address so we get
    // the global memory updates.
    while ((PERSISTENT || last_block) &&
           ((oldArrive ^ globalAsVolatile(semaphore)) & FIRST_UINT64_BIT) ==
               0) {
      // Put a sleep here so we have some breaks in probing the global
      // semaphore, giving a better chance for other warps/blocks to catch up.
#if __CUDA_ARCH__ >= 700
      __nanosleep(200);
#else
      // __nanosleep is not available for sm < 70
      assert(false);
#endif
    }
  }
  // Sync block to make sure all other threads are waiting on the sync
  block_sync::sync();
}
} // namespace grid_sync
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block. If set to false the dimension doesn't
// participate in the reduction. We could start with warp reductions, then
// reduce the warps, this could save some shared memory, but could be slower in
// some instances.
//
// EXAMPLE USAGE:
// blockReduceSum<X_THREADS, Y_THREADS, Z_THREADS>
// (output[output_index], inputs[input_index],
// [] __device__ (T& a, const T b) { a += b; });
//
// Note: We aggressively template functions taking dim3 in the functions below
// because ROCM uses different types for the various dim3 and maps them
// directly to intrinsics, but they're dim3 when used after modification.
//
// Intra-block tree reduction.
//
// Parameters:
// - out: per-thread output; only written by reduction-root threads that
//   also pass write_pred
// - inp_val: per-thread input value
// - reduction_op: binary functor; reduction_op(acc, val) accumulates into acc
// - thread_idx / block_dim: threadIdx / blockDim (templated because ROCm
//   uses distinct types for these)
// - shared_mem: scratch space, one T slot per thread in the block
// - read_pred: when false this thread contributes init_val instead of inp_val
// - write_pred: gates the final write to out
// - init_val: identity value of reduction_op
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename Func,
    typename _dim3,
    typename _dim3_2>
__device__ void blockReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);
  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);
  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          thread_idx, block_dim);
  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          thread_idx, block_dim);
  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;
  // Initialize shared memory
  if (read_pred) {
    shared_mem[smem_offset] = inp_val;
  } else {
    shared_mem[smem_offset] = init_val;
  }
  block_sync::sync();
  // Reduce down to nearest power of 2 for the tree reduction:
  int np2 = 1 << (31 - __clz(reduction_size));
  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + np2]);
  }
  block_sync::sync();
  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + factor]);
    }
    block_sync::sync();
  }
  if (should_write && write_pred) {
    T result = out;
    reduction_op(result, shared_mem[smem_offset]);
    // Peeled final tree level: fold in the element at offset + 1.
    if (reduction_size > 1) {
      reduction_op(result, shared_mem[smem_offset + 1]);
    }
    out = result;
  }
  block_sync::sync();
}
// Use the same pred for both reads and writes
// Convenience overload forwarding to the two-predicate blockReduce.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename Func,
    typename _dim3,
    typename _dim3_2>
__device__ void blockReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem,
    bool read_write_pred,
    T init_val) {
  blockReduce<X_REDUCE, Y_REDUCE, Z_REDUCE, T, Func, _dim3, _dim3_2>(
      out,
      inp_val,
      reduction_op,
      thread_idx,
      block_dim,
      shared_mem,
      read_write_pred,
      read_write_pred,
      init_val);
}
// Inter-block reduction.
//
// The gridReduce function performs point-wise reductions of scalars across
// thread blocks. Thread blocks are disjointly partitioned into groups,
// "reduction segments", that are collectively defined by boolean template
// parameters, X_BLOCK, Y_BLOCK and Z_BLOCK. Each of X/Y/Z_BLOCK determines
// whether thread blocks along the dimension should be grouped into the same
// reduction segment. Cross-block reducitons are independently done within each
// segment and generates distinctive results per segment. For instance, if all
// of X/Y/Z_BLOCK are true, reductions will be done across all thread blocks
// since there will be just a single segment consisting of all thread blocks. If
// none of them are true, each thread block will become a segment by itself, so
// no reduction will be performed.
//
// The input scalars to reduce within each segment are a certain subset of
// thread-private scalars provided as part of the gridReduce function
// parameters. Boolean template parameters, X_THREAD, Y_THREAD and Z_THREAD,
// determine which subset of the scalars should be used for inter-block
// reductions. Specifically, all the input scalars of threads along each
// dimension will be used when X/Y/Z_THREAD are true. Otherwise, only the value
// held at offset 0 of each dimension will be used. Thus, for example, if all of
// X/Y/Z_THREAD are true, the scalars of all threads in each block will
// participate in inter-block reductions. If all of them are false, only one
// scalar of the thread at threadIdx.x == threadIdx.y == threadIdx.z == 0 will
// be used. In the code below, we call the subset of threads a "reduction
// block". "Participating" thread dimensions here are similar to the
// "non-participating" block dimensions. They come from a block dimension that
// has not been reduced before hitting this grid reduction.
//
// Inter-block reductions perform point-wise reductions of scalars of reduction
// blocks within each reduction segment. More specifically, let rb be a
// reduction block and rs be a reduction segment. Let IN(thread_idx, block_idx)
// denote the input scalar of thread at thread_idx and block_idx. The result of
// each reduction segment, OUT(thread_idx, block_idx_out), is defined only for
// each thread_idx in thread block block_idx_out in the segment as follows:
//
// OUT(thread_idx, block_idx_out) =
// Reduction of IN(thread_idx, block_idx) for
// all block_idx in a reduction segment
//
// OUT is not given for all threads that are not in block_idx_out and the
// reduction block.
//
// See also the function comment of gridReduce.
namespace reduction {
// Reduces all the reduction blocks in each reduction segment. This is the
// "cleanup" stage of a grid reduction.
//
// This is only called by one thread block per reduction segment. The input
// reduction blocks of the segment are stored in an intermediate buffer pointed
// by parameter in. Template parameters X/Y/Z_THREAD denote how the reduction
// block is formed.
//
// The size of a reduction block is by definition smaller or equal to the size
// of a thread block. We use the remaining threads to parallelize reductions
// across reduction blocks. For example, when X/Y/Z_THREAD = {true, false,
// false}, we use blockDim.y*blockDim.z threads for each output value. This is
// done first by loading the input values in parallel and then by reducing
// across threads of dimensions whose XYZ_THREAD are false.
//
// Note that what is done here after the loading from global memory is similar
// to what the existing blockReduce function does.
template <
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    typename T,
    typename Func>
__device__ void gridReduceLastBlock(
    T& out,
    const T* in,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    Func reduction_op,
    T* shared_buf,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.
  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, blockDim);
  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
  T inp = init_val;
  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    reduction_op(inp, in[work_buf_offset]);
  }
  // Block reduce the per thread values into per "participating" thread values
  T inp_tmp = init_val;
  blockReduce<!X_THREAD, !Y_THREAD, !Z_THREAD>(
      inp_tmp,
      inp,
      reduction_op,
      threadIdx,
      blockDim,
      shared_buf,
      true,
      init_val);
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    reduction_op(out, inp_tmp);
  }
}
// Reduces per-thread values across thread blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - reduction_op: Scalar reduction function
// - work_buf: Temporary buffer for cross-block reductions
// - sync_flags: A vector of integers for synchronizations
// - shared_buf: Shared memory buffer for intra-block reduction
//
// Thread has valid results based on if it's the last block in the grid
// reduction dimension
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, reduces across thread blocks along the X/Y/Z
//   dimensions
// - X/Y/Z_THREAD: When true, all threads along the X/Y/Z dimensions participate
//   in the cross-block reduction. Otherwise, only threads at offset 0 do.
//   These are set to true if the dimension in the block has not been reduced
//   previously in producer tensors, and does not participate in the reduction
//   (right now they can't), so it's just a "pure" iteration domain as far as
//   the grid reduce is concerned.
// - PERSISTENT_REDUCTION: Indicates grid reduction will be called in a loop, or
//   the result of the grid reduction will be broadcasted and used across the
//   grid. These require cross grid communication and the grid synchronizations
//   here to actually synchronize across the entire grid. When false the grid is
//   not synchronized, the last block just waits for everyone else to finish and
//   the other blocks can exit early.
// - T: Scalar data type of input/output data
// - Func: Type of scalar reduction function
//
// Template parameters X/Y/Z_BLOCK define a group of thread blocks that are
// reduced together. We call it a reduction segment. Some examples are:
//
// Case 1: X/Y/Z_BLOCK == true/true/true -> There is only one segment, which
// includes all thread blocks. It is effectively the same as the grid.
//
// Case 2: X/Y/Z_BLOCK == false/false/false -> Each thread block comprises an
// individual segment by itself.
//
// Case 3: X/Y/Z_BLOCK == true/false/false -> Each segment contains thread
// blocks that have the same blockDim.x. There will be blockDim.y*blockDim.z
// such segments.
//
// X/Y/Z_THREAD defines a sub region of a thread block that should be reduced
// with the sub regions of other thread blocks. We call it a reduction block.
// E.g.,
//
// Case 1: X/Y/Z_THREAD == false/false/false -> Only thread 0 participates in
// the cross-block reductions. The reduction block is 1x1x1 with thread 0.
//
// Case 2: X/Y/Z_THREAD == true/true/true-> All threads in a thread block
// participate in the cross-block reductions. The reduction block in this case
// is equivalent to the thread block.
//
// After the function completes, only one thread block per reduction segment
// gets valid reduction results. There is no guarantee which particular block
// gets the final results.
//
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    typename T,
    typename Func>
__device__ void gridReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    volatile T* work_buf,
    Tensor<int64_t, 1> sync_flags,
    T* shared_buf,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);
  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, blockDim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    if (read_pred) {
      work_buf[work_buf_offset] = inp_val;
    } else {
      work_buf[work_buf_offset] = init_val;
    }
  }
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
      sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
  if (last_block) {
    // Cleanup with block reduction
    gridReduceLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
        out,
        (T*)work_buf,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        reduction_op,
        shared_buf,
        write_pred,
        init_val);
  }
  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
        sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  }
}
} // namespace reduction
// Undefine helper macros so they don't leak further into the kernel.
#undef isize
#undef ioffset
namespace grid_broadcast {
// Broadcasts per-thread values across threads and blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - work_buf: Temporary buffer for communication across threads/blocks
// - sync_flags: A vector of integers for synchronizations
// - read_write_pred: Predicate gating both the buffer write and the read
//   back into out
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, broadcasts across thread blocks along the X/Y/Z
//   dimensions
// - X/Y/Z_THREAD: When true, broadcasts across threads along the X/Y/Z
//   dimensions
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    typename T>
__device__ void broadcast(
    T& out,
    const T& inp_val,
    volatile T* work_buf,
    Tensor<int64_t, 1> sync_flags,
    bool read_write_pred) {
  // Number of values broadcasted in the grid dimensions
  const auto grid_seg_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the broadcast we're performing out of the grid_seg_size
  const auto grid_seg_idx =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of threads not participating in a broadcast dimension, this is the
  // number of thread entries to expect in the work buffer, therefore a striding
  const auto block_stride =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
  // Which broadcast in the block this is to line up the entry with the work
  // buffer
  const auto thread_offset =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  // The broadcast source: the last block and thread offset 0 along every
  // broadcast dimension.
  const bool has_valid_data = (!X_BLOCK || blockIdx.x == gridDim.x - 1) &&
      (!Y_BLOCK || blockIdx.y == gridDim.y - 1) &&
      (!Z_BLOCK || blockIdx.z == gridDim.z - 1) &&
      (!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) &&
      (!Z_THREAD || threadIdx.z == 0);
  if (has_valid_data && read_write_pred) {
    work_buf[grid_seg_idx * block_stride + thread_offset] = inp_val;
    // Make the write visible to all blocks before the grid sync.
    __threadfence();
  }
  // (Fix: removed an unused local `bool null = false;` left over here.)
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
      sync_flags[grid_seg_idx], grid_seg_size);
  if (read_write_pred) {
    out = work_buf[grid_seg_idx * block_stride + thread_offset];
  }
  // Make sure everyone has read from the buffer before continuing the kernel
  // and potentially overwriting
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
      sync_flags[grid_seg_idx], grid_seg_size);
}
} // namespace grid_broadcast
namespace broadcast {
// Broadcasts within partitioned groups of threads.
//
// X_THREAD: Broadcast from threadIdx.x == 0 if true
// Y_THREAD: Broadcast from threadIdx.y == 0 if true
// Z_THREAD: Broadcast from threadIdx.z == 0 if true
// inp_val: Per-thread source value. Only valid when the thread is a source.
// out: Per-thread output location
// shared_mem: scratch space, one T slot per broadcast group in the block
// read_write_pred: gates both the shared-memory write and the read into out
//
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T>
__device__ void blockBroadcast(
    T& out,
    const T& inp_val,
    T* shared_mem,
    bool read_write_pred) {
  // Source thread: offset 0 along every broadcast dimension.
  const bool has_valid_data = (!X_THREAD || threadIdx.x == 0) &&
      (!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0);
  // One shared-memory slot per group of receiving threads.
  const auto shared_offset =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  if (has_valid_data && read_write_pred) {
    shared_mem[shared_offset] = inp_val;
  }
  block_sync::sync();
  if (read_write_pred) {
    out = shared_mem[shared_offset];
  }
  // Second sync so the scratch space can safely be reused afterwards.
  block_sync::sync();
}
} // namespace broadcast
// -----------------------------------------------------------------------------------------------
// Block Welford Primitives
// -----------------------------------------------------------------------------------------------
// Basic utility for welford update. Can be used to scan one value, or to
// merge two welford results. Updates (a_avg, a_M2, a_N) in place with the
// partition (b_avg, b_M2, b_N) using the parallel variance combine formula.
template <typename T, typename TN>
__inline__ __device__ void welfordCombine(
    T& a_avg,
    T& a_M2,
    TN& a_N,
    const T& b_avg,
    const T& b_M2,
    TN b_N) {
  // Nothing to merge from an empty partition.
  if (b_N == 0) {
    return;
  }
  TN ab_N = a_N + b_N;
  T b_N_div_ab_N = ((T)(nvfuser_index_t)(b_N)) / ((T)(nvfuser_index_t)(ab_N));
  T delta = b_avg - a_avg;
  a_avg += delta * b_N_div_ab_N;
  a_M2 += b_M2 + delta * delta * ((T)(nvfuser_index_t)(a_N)) * b_N_div_ab_N;
  a_N = ab_N;
}
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block.
// Intra-block Welford reduction; mirrors blockReduce but carries the
// (avg, M2, N) triple through three parallel shared-memory arrays.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename TN,
    typename _dim3,
    typename _dim3_2>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);
  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);
  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          thread_idx, block_dim);
  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          thread_idx, block_dim);
  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;
  // Masked-out threads contribute an empty partition (N == 0), which
  // welfordCombine ignores.
  if (read_pred) {
    shared_mem_avg[smem_offset] = in_avg;
    shared_mem_M2[smem_offset] = in_M2;
    shared_mem_N[smem_offset] = in_N;
  } else {
    shared_mem_avg[smem_offset] = init_val;
    shared_mem_M2[smem_offset] = init_val;
    shared_mem_N[smem_offset] = 0;
  }
  block_sync::sync();
  // Reduce down to nearest power of 2:
  int np2 = 1 << (31 - __clz(reduction_size));
  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    welfordCombine(
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset],
        shared_mem_avg[smem_offset + np2],
        shared_mem_M2[smem_offset + np2],
        shared_mem_N[smem_offset + np2]);
  }
  block_sync::sync();
  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      welfordCombine(
          shared_mem_avg[smem_offset],
          shared_mem_M2[smem_offset],
          shared_mem_N[smem_offset],
          shared_mem_avg[smem_offset + factor],
          shared_mem_M2[smem_offset + factor],
          shared_mem_N[smem_offset + factor]);
    }
    block_sync::sync();
  }
  if (should_write && write_pred) {
    T res_avg = out_avg;
    T res_M2 = out_M2;
    TN res_N = out_N;
    welfordCombine(
        res_avg,
        res_M2,
        res_N,
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset]);
    // Peeled final tree level: fold in the element at offset + 1.
    if (reduction_size > 1) {
      welfordCombine(
          res_avg,
          res_M2,
          res_N,
          shared_mem_avg[smem_offset + 1],
          shared_mem_M2[smem_offset + 1],
          shared_mem_N[smem_offset + 1]);
    }
    out_avg = res_avg;
    out_M2 = res_M2;
    out_N = res_N;
  }
  block_sync::sync();
}
// Convenience overload of blockWelford that applies a single predicate to
// both the shared-memory read and the final write: all arguments are
// forwarded unchanged to the primary overload, with read_write_pred
// supplied as both read_pred and write_pred.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename TN,
    typename _dim3,
    typename _dim3_2>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_write_pred,
    T init_val) {
  blockWelford<X_REDUCE, Y_REDUCE, Z_REDUCE, T, TN, _dim3, _dim3_2>(
      out_avg,
      out_M2,
      out_N,
      in_avg,
      in_M2,
      in_N,
      thread_idx,
      block_dim,
      shared_mem_avg,
      shared_mem_M2,
      shared_mem_N,
      read_write_pred, // read predicate
      read_write_pred, // write predicate
      init_val);
}
// -----------------------------------------------------------------------------------------------
// Grid Welford Prototype
// -----------------------------------------------------------------------------------------------
namespace welford {
// Final ("last block") stage of a grid-wide Welford reduction.
//
// Combines the per-block partial results stored in in_avg/in_M2/in_N into
// out_avg/out_M2/out_N. Executed by a single block per reduction segment
// (the caller, gridWelford, gates on maskedIsLast). Threads in the
// dimensions marked false by [X,Y,Z]_THREAD parallelize the combine;
// shared_buf_* provide scratch for the intra-block blockWelford call.
// out_* are updated only by "participating" threads (offset 0 in the
// non-participating dims) and only when write_pred holds.
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T, typename TN>
__device__ void gridWelfordLastBlock(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T* in_avg,
    const T* in_M2,
    const TN* in_N,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.
  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, blockDim);
  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
  T inp_avg = init_val;
  T inp_M2 = init_val;
  TN inp_N = 0;
  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    welfordCombine(
        inp_avg,
        inp_M2,
        inp_N,
        in_avg[work_buf_offset],
        in_M2[work_buf_offset],
        in_N[work_buf_offset]);
  }
  // Block reduce the per thread values into per "participating" thread values
  T inp_avg_tmp = init_val;
  T inp_M2_tmp = init_val;
  TN inp_N_tmp = 0;
  blockWelford<!X_THREAD, !Y_THREAD, !Z_THREAD>(
      inp_avg_tmp,
      inp_M2_tmp,
      inp_N_tmp,
      inp_avg,
      inp_M2,
      inp_N,
      threadIdx,
      blockDim,
      shared_buf_avg,
      shared_buf_M2,
      shared_buf_N,
      true,
      init_val);
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    welfordCombine(out_avg, out_M2, out_N, inp_avg_tmp, inp_M2_tmp, inp_N_tmp);
  }
}
// Grid welford combine.
//
// Each block writes its partial (avg, M2, N) triple into the global work
// buffers, all blocks of the segment synchronize through sync_flags, and
// the last block to arrive combines the partials via gridWelfordLastBlock.
// [X,Y,Z]_BLOCK select the grid dimensions reduced over; [X,Y,Z]_THREAD
// select which thread values participate (see the gridReduce comment
// later in this file for the segment/participation model).
// PERSISTENT_REDUCTION adds a trailing grid sync so the work buffers can
// be reused by a subsequent call in the same kernel.
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    typename T,
    typename TN>
__device__ void gridWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& inp_avg,
    const T& inp_M2,
    const TN& inp_N,
    volatile T* work_buf_avg,
    volatile T* work_buf_M2,
    volatile TN* work_buf_N,
    Tensor<int64_t, 1> sync_flags,
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);
  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf_avg += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  work_buf_M2 += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  work_buf_N += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  // Only "participating" threads (offset 0 in the non-participating dims)
  // publish a partial; read_pred == false publishes the init value instead.
  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, blockDim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    if (read_pred) {
      work_buf_avg[work_buf_offset] = inp_avg;
      work_buf_M2[work_buf_offset] = inp_M2;
      work_buf_N[work_buf_offset] = inp_N;
    } else {
      work_buf_avg[work_buf_offset] = init_val;
      work_buf_M2[work_buf_offset] = init_val;
      work_buf_N[work_buf_offset] = 0;
    }
  }
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
      sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
  if (last_block) {
    // final reduction
    gridWelfordLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
        out_avg,
        out_M2,
        out_N,
        (T*)work_buf_avg,
        (T*)work_buf_M2,
        (TN*)work_buf_N,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        shared_buf_avg,
        shared_buf_M2,
        shared_buf_N,
        write_pred,
        init_val);
  }
  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
        sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  }
}
} // namespace welford
#undef isize
#undef ioffset
namespace warp {
// Block-level reduction along threadIdx.x implemented with warp shuffles.
//
// Each warp first reduces its 32 lanes with __shfl_xor_sync. If the block
// holds more than one warp along x (SINGLE_WARP == false), each warp's
// lane 0 stages its partial in shared_mem, warp 0 then reduces the staged
// partials, and each warp-head lane folds its current value into `out`
// (callers read `out` from the appropriate thread). The existing comment
// below notes the input is assumed padded to a multiple of the warp size.
template <
    bool SINGLE_WARP,
    typename T,
    typename Func,
    typename _dim3ti,
    typename _dim3bd>
__device__ void warpReduceTIDX(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3ti& thread_idx,
    const _dim3bd& block_dim,
    T* shared_mem,
    bool read_write_pred,
    T init_val) {
  constexpr int WARP_SIZE = 32;
  // Assume input padded to multiples of a warp
  T reduce_val = init_val;
  // Do warp reduction
  if (read_write_pred) {
    reduce_val = inp_val;
  }
  // Reduce within each warp
  for (int i = 16; i >= 1; i /= 2) {
    reduction_op(
        reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, WARP_SIZE));
  }
  // Reduce across warp if needed
  // Load value to shared mem
  if (!SINGLE_WARP) {
    unsigned int warp_idx = thread_idx.x / WARP_SIZE;
    unsigned int lane_idx = thread_idx.x % WARP_SIZE;
    unsigned int reduce_group_id = thread_idx.z * block_dim.y + thread_idx.y;
    bool is_warp_head = lane_idx == 0;
    unsigned int reduction_size = block_dim.x;
    unsigned int num_of_warps = reduction_size / WARP_SIZE;
    unsigned int smem_offset = reduce_group_id * num_of_warps;
    block_sync::sync();
    if (read_write_pred && is_warp_head) {
      shared_mem[smem_offset + warp_idx] = reduce_val;
    }
    block_sync::sync();
    if (warp_idx == 0) {
      // This assumes num_of_warps will be <= 32, i.e. at most 1024 threads
      // along x. Should be true for long enough.
      assert(num_of_warps <= 32);
      reduce_val = lane_idx < num_of_warps ? shared_mem[smem_offset + lane_idx]
                                           : init_val;
      // Reduce within warp 0
      for (int i = 16; i >= 1; i /= 2) {
        reduction_op(
            reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, 32));
      }
    }
    if (is_warp_head) {
      reduction_op(out, reduce_val);
    }
  } else {
    reduction_op(out, reduce_val);
  }
}
} // namespace warp
// No "#pragma once" because this is a raw definition that can be copied by jit codegen.
// Eager mode clients should not include this file directly, instead,
// they should #include <ATen/cuda/CUDAGeneratorImpl.h>, which has a #pragma once.
// Stores RNG state values. Passed as a kernel argument.
// See Note [CUDA Graph-safe RNG states].
//
// The raw definition lives in its own file so jit codegen can easily copy it.
namespace at {
struct PhiloxCudaState {
  PhiloxCudaState() = default;
  // Called if graph capture is not underway: the offset is stored by value.
  PhiloxCudaState(uint64_t seed,
                  uint64_t offset) {
    seed_ = seed;
    offset_.val = offset;
  }
  // Called if graph capture is underway: the offset lives outside the graph
  // and is read through offset_.ptr, plus a fixed intragraph adjustment.
  PhiloxCudaState(uint64_t seed,
                  int64_t* offset_extragraph,
                  uint32_t offset_intragraph) {
    seed_ = seed;
    offset_.ptr = offset_extragraph;
    offset_intragraph_ = offset_intragraph;
    captured_ = true;
  }
  // Public members, directly accessible by at::cuda::philox::unpack.
  // If we made them private with getters/setters, the getters/setters
  // would have to be __device__, and we can't declare __device__ in ATen.
  // Payload holds either the immediate offset (val) or the extragraph
  // pointer (ptr); captured_ discriminates which member is active.
  union Payload {
    uint64_t val;
    int64_t* ptr;
  };
  uint64_t seed_ = 0;
  Payload offset_;
  uint32_t offset_intragraph_ = 0;
  bool captured_ = false;
};
} // namespace at
// Generated fusion kernel: element-wise select T3 = where(T0, T1, T2) on
// 0-dim bool tensors.
// NOTE(review): with only double/float/int64_t overloads of where() in
// scope, the (bool, bool, bool) call below is ambiguous under implicit
// conversions -- this is the "more than one instance of overloaded
// function" NVRTC failure reported immediately after this kernel in the
// log.
__global__ void kernel187(Tensor<bool, 0> T0, Tensor<bool, 0> T1, Tensor<bool, 0> T2, Tensor<bool, 0> T3) {
  T3[0]
      = where(T0[0], T1[0], T2[0]);
}
}
CUDA NVRTC compile error: default_program(1670): error: more than one instance of overloaded function "CudaCodeGen::where" matches the argument list:
function "CudaCodeGen::where(__nv_bool, double, double)"
function "CudaCodeGen::where(__nv_bool, float, float)"
function "CudaCodeGen::where(__nv_bool, CudaCodeGen::int64_t, CudaCodeGen::int64_t)"
argument types are: (__nv_bool, __nv_bool, __nv_bool)
1 error detected in the compilation of "default_program".
======================================================================
ERROR: test_nvfuser_correctness__masked_prod_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1337, in __exit__
raise RuntimeError(msg)
RuntimeError: CUDA driver API confirmed a leak in __main__.TestCudaFuserOpInfoCUDA.test_nvfuser_correctness__masked_prod_cuda_float32! Caching allocator allocated memory was 1485824 and is now reported as 1513984 on device 0. CUDA driver allocated memory was 1715470336 and is now 1717567488.
======================================================================
ERROR: test_nvfuser_correctness__masked_prod_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: falseINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/executor_utils.cpp":828, please report a bug to PyTorch. namespace CudaCodeGen {
// Fixed-width integer typedefs for the generated kernel (NVRTC does not
// pull in <cstdint>). nvfuser_index_t is the index type chosen for this
// compilation -- 32-bit here.
typedef unsigned char uint8_t;
typedef signed char int8_t;
typedef short int int16_t;
typedef int int32_t;
typedef unsigned int uint32_t;
typedef long long int int64_t;
typedef unsigned long long int uint64_t;
typedef int nvfuser_index_t;
// Reinterpret a __half's 16-bit storage as (const) unsigned short so it can
// be bound to inline-PTX "h" operands.
#define __NVFUSER_HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_HALF_TO_CUS(var) \
  *(reinterpret_cast<const unsigned short*>(&(var)))
struct __half;
__device__ __half __float2half(const float);
// Minimal 16-bit half-precision storage type; all conversions go through
// inline PTX below.
struct __align__(2) __half {
  __half() = default;
  __device__ __half(const float f) {
    __x = __float2half(f).__x;
  }

 protected:
  unsigned short __x;
};
// float -> half, round-to-nearest-even (cvt.rn.f16.f32).
__device__ __half __float2half(const float f) {
  __half val;
  asm("{ cvt.rn.f16.f32 %0, %1;}\n"
      : "=h"(__NVFUSER_HALF_TO_US(val))
      : "f"(f));
  return val;
}
// half -> float (exact; every f16 value is representable in f32).
__device__ float __half2float(const __half h) {
  float val;
  asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__NVFUSER_HALF_TO_CUS(h)));
  return val;
}
// aligned vector generates vectorized load/store on CUDA
// (alignment == sizeof(scalar_t) * vec_size lets the compiler emit a single
// wide memory transaction for the whole array).
template <typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) Array {
  scalar_t val[vec_size];
  // Broadcast-assign v to every element.
  __device__ void set(scalar_t v) {
    for (int i = 0; i < vec_size; ++i) {
      val[i] = v;
    }
  }
};
// Reinterpret a __bfloat's 16-bit storage as (const) unsigned short for
// inline-PTX "h" operands.
#define __NVFUSER_BFLOAT_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_BFLOAT_TO_CUS(var) \
  *(reinterpret_cast<const unsigned short*>(&(var)))
struct __bfloat;
__device__ __bfloat __float2bfloat(const float);
// Minimal 16-bit bfloat16 storage type; conversions go through inline PTX.
struct __align__(2) __bfloat {
  __bfloat() = default;
  __device__ __bfloat(const float f) {
    __x = __float2bfloat(f).__x;
  }

 protected:
  unsigned short __x;
};
// float -> bfloat16, round-to-nearest-even (cvt.rn.bf16.f32).
__device__ __bfloat __float2bfloat(const float f) {
  __bfloat val;
  asm("{ cvt.rn.bf16.f32 %0, %1;}\n"
      : "=h"(__NVFUSER_BFLOAT_TO_US(val))
      : "f"(f));
  return val;
}
// bfloat16 -> float: places the 16 stored bits in the high half of the
// 32-bit float (mov.b32 {0,%1}), which is exact for bf16.
__device__ float __bfloat2float(const __bfloat h) {
  float val;
  asm("{ mov.b32 %0, {0,%1};}\n"
      : "=f"(val)
      : "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
  return val;
}
// Non-owning tensor view passed to generated kernels: raw data pointer plus
// per-dimension size and stride arrays. operator[] takes a fully linearized
// element index.
template <typename T, int N>
struct Tensor {
  __device__ T& operator[](nvfuser_index_t ind) {
    return data[ind];
  };
  T* data;
  nvfuser_index_t size[N];
  nvfuser_index_t stride[N];
};
// Specialization for 0-dim case as it does not need size and stride arrays.
// They will be an error as well since zero-length arrays are not allowed.
template <typename T>
struct Tensor<T, 0> {
  __device__ T& operator[](nvfuser_index_t) {
    return *data;
  };
  T* data;
};
// Counter-based Philox PRNG (4x32-bit counter, 2x32-bit key).
// The constructor derives the key from `seed`, seeds counter.z/.w from
// `subsequence`, and advances the counter by offset / 4 -- each counter
// value yields four 32-bit outputs. operator() returns one 32-bit output
// per call, regenerating a fresh uint4 every fourth call.
class Philox {
 public:
  __device__ Philox(
      unsigned long long seed,
      unsigned long long subsequence,
      unsigned long long offset) {
    key.x = (unsigned int)seed;
    key.y = (unsigned int)(seed >> 32);
    counter = make_uint4(0, 0, 0, 0);
    counter.z = (unsigned int)(subsequence);
    counter.w = (unsigned int)(subsequence >> 32);
    STATE = 0;
    incr_n(offset / 4);
  }
  __device__ unsigned long operator()() {
    if (STATE == 0) {
      uint4 counter_ = counter;
      uint2 key_ = key;
      // Nine keyed rounds here plus the final round below: ten in total.
      for (int i = 0; i < 9; i++) {
        counter_ = single_round(counter_, key_);
        key_.x += (kPhilox10A);
        key_.y += (kPhilox10B);
      }
      output = single_round(counter_, key_);
      incr();
    }
    // Hand out the four 32-bit words of `output` across four calls.
    unsigned long ret = 0;
    switch (STATE) {
      case 0:
        ret = output.x;
        break;
      case 1:
        ret = output.y;
        break;
      case 2:
        ret = output.z;
        break;
      case 3:
        ret = output.w;
        break;
    }
    STATE = (STATE + 1) % 4;
    return ret;
  }

 private:
  // Advance the 128-bit counter by n, with manual carry propagation across
  // the four 32-bit limbs.
  __device__ void incr_n(unsigned long long n) {
    unsigned int nlo = (unsigned int)(n);
    unsigned int nhi = (unsigned int)(n >> 32);
    counter.x += nlo;
    if (counter.x < nlo)
      nhi++;
    counter.y += nhi;
    if (nhi <= counter.y)
      return;
    if (++counter.z)
      return;
    ++counter.w;
  }
  // Advance the 128-bit counter by one.
  __device__ void incr() {
    if (++counter.x)
      return;
    if (++counter.y)
      return;
    if (++counter.z)
      return;
    ++counter.w;
  }
  // 32x32 -> 64 multiply: returns the low 32 bits, stores the high 32 bits
  // through result_high.
  __device__ unsigned int mulhilo32(
      unsigned int a,
      unsigned int b,
      unsigned int* result_high) {
    *result_high = __umulhi(a, b);
    return a * b;
  }
  // One Philox round: two 32x32 multiplies plus XOR with the key.
  __device__ uint4 single_round(uint4 ctr, uint2 key) {
    unsigned int hi0;
    unsigned int hi1;
    unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
    unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
    uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
    return ret;
  }

 private:
  // Philox4x32-10 round and multiplier constants.
  static constexpr unsigned long kPhilox10A = 0x9E3779B9;
  static constexpr unsigned long kPhilox10B = 0xBB67AE85;
  static constexpr unsigned long kPhiloxSA = 0xD2511F53;
  static constexpr unsigned long kPhiloxSB = 0xCD9E8D57;
  uint4 counter = {};
  uint4 output = {};
  uint2 key = {};
  unsigned int STATE = 0;
};
// Map one 32-bit draw to a float in [0, 1) by scaling with 2^-32.
__device__ float uniformf(unsigned int x) {
  constexpr float kRanInvM32 = 2.3283064e-10f; // Inverse of 2^32.
  return x * kRanInvM32;
}
// Map two 32-bit draws to a double: combine into 53 random bits, scale by
// 2^-53, and shift by half an ulp so 0 is excluded.
__device__ double uniform(unsigned int x, unsigned int y) {
  constexpr double kRan2Pow53Inv = 1.1102230246251565e-16; // 2^-53
  const unsigned long long z =
      (unsigned long long)x ^ ((unsigned long long)y << (53 - 32));
  return z * kRan2Pow53Inv + (kRan2Pow53Inv / 2.0);
}
// Defines nvfuser_zero, a value that is always 0 at runtime (shared value
// set to 0, then atomicMin'd with non-negative threadIdx.x) but arrives via
// shared memory, so the compiler cannot constant-fold it.
// NOTE(review): presumably used to keep index arithmetic from being
// over-optimized -- confirm against nvfuser codegen usage.
#define NVFUSER_DEFINE_MAGIC_ZERO          \
  __shared__ int nvfuser_zero_s;           \
  if (threadIdx.x == 0)                    \
    nvfuser_zero_s = 0;                    \
  __syncthreads();                         \
  atomicMin(&nvfuser_zero_s, threadIdx.x); \
  int nvfuser_zero = nvfuser_zero_s;
// Left-shifting keeps nvfuser_zero at 0 while making it loop-variant to the
// compiler.
#define NVFUSER_UPDATE_MAGIC_ZERO \
  do {                            \
    nvfuser_zero <<= 1;           \
  } while (0);
// Integer ceiling division: (a + b - 1) / b, i.e. ceil(a / b) for
// non-negative a and positive b. Mixed int/int64_t overloads promote both
// operands to int64_t.
__device__ constexpr int ceilDiv(int a, int b) {
  return (a + b - 1) / b;
}
__device__ constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}
__device__ constexpr int64_t ceilDiv(int64_t a, int b) {
  return ceilDiv(a, (int64_t)b);
}
__device__ constexpr int64_t ceilDiv(int a, int64_t b) {
  return ceilDiv((int64_t)a, b);
}
// Integer max overloads; mixed int/int64_t operands promote to int64_t and
// defer to the device ::max.
__device__ constexpr int max(int a, int b) {
  return ::max(a, b);
}
__device__ constexpr int64_t max(int64_t a, int b) {
  return ::max(a, (int64_t)b);
}
__device__ constexpr int64_t max(int a, int64_t b) {
  return ::max((int64_t)a, b);
}
__device__ constexpr int64_t max(int64_t a, int64_t b) {
  return ::max(a, b);
}
// Floating-point max that propagates a NaN from either operand, unlike
// ::fmax which returns the non-NaN operand.
__device__ double fmax(double a, double b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmax(a, b);
  }
}
__device__ float fmax(float a, float b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmax(a, b);
  }
}
// Integer min overloads; same promotion scheme as max above.
__device__ constexpr int min(int a, int b) {
  return ::min(a, b);
}
__device__ constexpr int64_t min(int64_t a, int b) {
  return ::min(a, (int64_t)b);
}
__device__ constexpr int64_t min(int a, int64_t b) {
  return ::min((int64_t)a, b);
}
__device__ constexpr int64_t min(int64_t a, int64_t b) {
  return ::min(a, b);
}
// Floating-point min that propagates a NaN from either operand, unlike
// ::fmin which returns the non-NaN operand.
__device__ double fmin(double a, double b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmin(a, b);
  }
}
__device__ float fmin(float a, float b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else if (b != b) {
    return b;
  } else {
    return ::fmin(a, b);
  }
}
// Round `buffer` up to the next multiple of `size`.
// NOTE(review): the mask arithmetic implies `size` is expected to be a
// power of two -- confirm against callers.
__device__ constexpr int alignBufferSize(int buffer, int size) {
  const int mask = size - 1;
  return (buffer + mask) & ~mask;
}
// Clamp x to the closed interval [minv, maxv]. Both comparisons are false
// for a NaN x, so NaN is returned unchanged.
__device__ double clamp(double x, double minv, double maxv) {
  if (x < minv) {
    return minv;
  }
  if (x > maxv) {
    return maxv;
  }
  return x;
}
// Float variant; bounds arrive as doubles, comparisons and the returned
// bound are converted exactly as in the conditional-expression form.
__device__ float clamp(float x, double minv, double maxv) {
  if (x < minv) {
    return minv;
  }
  if (x > maxv) {
    return maxv;
  }
  return x;
}
// Fractional part: x - trunc(x) (keeps the sign of x).
__device__ double frac(double x) {
  return x - trunc(x);
}
__device__ float frac(float x) {
  return x - trunc(x);
}
// Exact (erf-based) GELU via the normal CDF.
__device__ double gelu(double x) {
  return x * normcdf(x);
}
__device__ float gelu(float x) {
  return x * normcdf(x);
}
__device__ double reciprocal(double x) {
  return 1 / x;
}
__device__ float reciprocal(float x) {
  return 1 / x;
}
// max(x, 0), written as a conditional.
__device__ double relu(double x) {
  return x <= 0 ? 0 : x;
}
__device__ float relu(float x) {
  return x <= 0 ? 0 : x;
}
// NOTE(review): the integer overloads return float, which loses precision
// for |x| > 2^24 -- presumably intentional in codegen, but worth
// confirming.
__device__ float relu(int64_t x) {
  return x <= 0 ? 0 : x;
}
__device__ float relu(int x) {
  return x <= 0 ? 0 : x;
}
// Floored modulo: adjusts fmod so the result has the sign of b (Python-
// style remainder).
__device__ double remainder(double a, double b) {
  auto mod = ::fmod(a, b);
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}
__device__ float remainder(float a, float b) {
  auto mod = ::fmod(a, b);
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}
// Logistic sigmoid 1 / (1 + e^-x).
__device__ double sigmoid(double x) {
  return 1 / (1 + exp(-x));
}
__device__ float sigmoid(float x) {
  return 1 / (1 + exp(-x));
}
// SiLU / swish: x * sigmoid(x).
__device__ double silu(double x) {
  return x * sigmoid(x);
}
__device__ float silu(float x) {
  return x * sigmoid(x);
}
// Returns v where x <= t, else x.
__device__ double threshold(double x, double t, double v) {
  return x <= t ? v : x;
}
__device__ float threshold(float x, double t, double v) {
  return x <= t ? v : x;
}
// Ternary select: returns a if c is true, else b. One overload per scalar
// type emitted by codegen. A bool overload is included so that where() on
// three bool operands resolves to an exact match instead of being
// ambiguous among the arithmetic overloads -- the NVRTC failure logged in
// this file ("more than one instance of overloaded function ... argument
// types are: (__nv_bool, __nv_bool, __nv_bool)") is exactly that
// ambiguity. Existing calls are unaffected: they still exact-match their
// arithmetic overloads.
__device__ double where(bool c, double a, double b) {
  return c ? a : b;
}
__device__ float where(bool c, float a, float b) {
  return c ? a : b;
}
__device__ int64_t where(bool c, int64_t a, int64_t b) {
  return c ? a : b;
}
__device__ bool where(bool c, bool a, bool b) {
  return c ? a : b;
}
// Uniform double in [0, 1) from two 32-bit Philox draws.
__device__ double randLike(Philox& rnd) {
  return uniform(rnd(), rnd());
}
// Uniform float in [0, 1) from one 32-bit Philox draw.
__device__ float randLikef(Philox& rnd) {
  return uniformf(rnd());
}
// Floored modulo for integers: adjusts % so the result has the sign of b.
__device__ constexpr int64_t remainder(int64_t a, int64_t b) {
  auto mod = a % b;
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}
__device__ constexpr int remainder(int a, int b) {
  auto mod = a % b;
  if ((mod != 0) && ((b < 0) != (mod < 0)))
    mod += b;
  return mod;
}
// Truncated modulo (C semantics): integer overloads use %, floating-point
// overloads defer to ::fmod.
__device__ constexpr int64_t fmod(int64_t a, int64_t b) {
  return a % b;
}
__device__ constexpr int fmod(int a, int b) {
  return a % b;
}
__device__ constexpr double fmod(double a, double b) {
  return ::fmod(a, b);
}
__device__ constexpr float fmod(float a, float b) {
  return ::fmod(a, b);
}
// Integer power by repeated squaring. Negative exponents follow integer
// semantics: 1^b == 1, (-1)^b == +/-1 by parity of b, and any other base
// yields 0 (|a^b| < 1 truncates to zero). Float/double specializations
// below defer to ::pow.
template <typename T>
__device__ T pow(T a, T b) {
  if (b < 0) {
    if (a == 1) {
      return 1;
    } else if (a == -1) {
      auto negative = (-b) % static_cast<T>(2);
      return negative ? -1 : 1;
    } else {
      return 0;
    }
  } else {
    // Exponentiation by squaring: O(log b) multiplies.
    T result = 1;
    while (b) {
      if (b & 1) {
        result *= a;
      }
      b /= 2;
      a *= a;
    }
    return result;
  }
}
// Explicit instantiation definitions for the integer types codegen uses.
template int pow<int>(int a, int b);
template int64_t pow<int64_t>(int64_t a, int64_t b);
template <>
float pow<float>(float a, float b) {
  return ::pow(a, b);
}
template <>
double pow<double>(double a, double b) {
  return ::pow(a, b);
}
namespace index_utils {
// Utility functions
// Total size of provided dimension
template <typename _dim3>
__device__ __forceinline__ nvfuser_index_t size(const _dim3& d) {
  return (nvfuser_index_t)d.x * (nvfuser_index_t)d.y * (nvfuser_index_t)d.z;
}
// Linearized indexing of idx based on dim, if bool==false that dimension does
// not participate
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t maskedOffset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = 0;
  if (Z)
    offset += idx.z;
  if (Y)
    offset = offset * dim.y + idx.y;
  if (X)
    offset = offset * dim.x + idx.x;
  return offset;
}
// Linearized indexing of idx based on dim. All dimensions participate.
template <typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t offset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = idx.z;
  offset = offset * dim.y + idx.y;
  offset = offset * dim.x + idx.x;
  return offset;
}
// Masks the provided dim3, those == false get truncated to 1
template <bool X, bool Y, bool Z, typename _dim3>
__device__ dim3 maskedDims(const _dim3& dim) {
  return dim3{
      X ? (unsigned)dim.x : 1U,
      Y ? (unsigned)dim.y : 1U,
      Z ? (unsigned)dim.z : 1U};
}
// Provides total size of dim with masking, those dims == false do not
// participate in the size calculation
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
__device__ nvfuser_index_t maskedSize(const _dim3& dim) {
  return size(maskedDims<X_BLOCK, Y_BLOCK, Z_BLOCK>(dim));
}
// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3>
__device__ bool maskedIsZero(const _dim3& idx) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == 0;
  if (Y)
    isZero = isZero && idx.y == 0;
  if (Z)
    isZero = isZero && idx.z == 0;
  return isZero;
}
// Checks if provided idx is the last valid index (dim - 1) on those dims
// == true.
// NOTE(review): the local is named isZero (copy-paste from maskedIsZero
// above) but tracks "is last".
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ bool maskedIsLast(const _dim3& idx, const _dim3_2& dim) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == dim.x - 1;
  if (Y)
    isZero = isZero && idx.y == dim.y - 1;
  if (Z)
    isZero = isZero && idx.z == dim.z - 1;
  return isZero;
}
} // namespace index_utils
// Default block synchronization. Just use __barrier_sync
namespace block_sync {
// No per-block state is needed for the default barrier implementation.
__forceinline__ __device__ void init() {}
// Thread-block synchronization
__forceinline__ __device__ void sync() {
  __barrier_sync(0);
}
} // namespace block_sync
namespace grid_sync {
// Get the first bit in a 64 bit integer
#define FIRST_UINT64_BIT ((uint64_t)1 << (sizeof(uint64_t) * 8 - 1))
// Force a volatile (global-memory) read of global_val.
template <typename T>
__device__ T globalAsVolatile(volatile T& global_val) {
  return global_val;
}
// A grid synchronization that can be called multiple times in a kernel assuming
// all the blocks fit on device at once. The semaphore is an integer semaphore
// assumed to be initialized to 0 before launching the kernel. The persistent
// option should be invoked if this sync will be called multiple times in one
// kernel (i.e. having a grid reduce within a loop). Having multiple grid syncs
// called once in the same kernel does not require persistent mode. Segment size
// is the number of blocks participating in the sync in the dimensions marked by
// [X,Y,Z]_BLOCK. The granularity of this sync are those dimensions. I.E.
// Marking X and Y but not Z means there should be Z semaphores of size X*Y.
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, bool PERSISTENT>
__device__ void sync(int64_t& semaphore, const uint64_t& segment_size) {
  // Finish all global memory transactions before synchronizing
  __threadfence();
  // Synchronize all threads in a block before synchronizing blocks
  block_sync::sync();
  // Only allow linear_tid == 0 to participate in the synchronization
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    // Get increment value, only want a single block to have the large
    // increment, doesn't really matter which one, the goal is to flip/flop the
    // first bit of a uint64_t value, since our semaphores are actually int64_t
    // we will just reinterpret_cast it to act as a uint64_t
    uint64_t semaphore_increment = 1;
    // Makes the assumption that blocks are in increasing order, this is not
    // guaranteed by CUDA but this is the current behavior, and unlikely to
    // change.
    bool last_block =
        index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    if (last_block) {
      // The last block's increment completes the sum segment_size - 1 (from
      // the other blocks) to exactly FIRST_UINT64_BIT, flipping the top bit.
      semaphore_increment = FIRST_UINT64_BIT - (segment_size - 1);
    }
    uint64_t oldArrive =
        atomicAdd(reinterpret_cast<uint64_t*>(&semaphore), semaphore_increment);
    // If for persistent kernels, lock all blocks until the semaphore has been
    // reached. Make sure we access semaphore as a volatile address so we get
    // the global memory updates. Spin while the top bit has not flipped
    // relative to the value observed at arrival.
    while ((PERSISTENT || last_block) &&
           ((oldArrive ^ globalAsVolatile(semaphore)) & FIRST_UINT64_BIT) ==
               0) {
      // Put a sleep here so we have some breaks in probing the global
      // semaphore, giving a better chance for other warps/blocks to catch up.
#if __CUDA_ARCH__ >= 700
      __nanosleep(200);
#else
      // __nanosleep is not available for sm < 70
      assert(false);
#endif
    }
  }
  // Sync block to make sure all other threads are waiting on the sync
  block_sync::sync();
}
} // namespace grid_sync
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block. If set to false the dimension doesn't
// participate in the reduction. We could start with warp reductions, then
// reduce the warps, this could save some shared memory, but could be slower in
// some instances.
//
// EXAMPLE USAGE:
// blockReduceSum<X_THREADS, Y_THREADS, Z_THREADS>
//   (output[output_index], inputs[input_index],
//     [] __device__ (T& a, const T b) { a += b; });
//
// Note: We aggressively template functions taking dim3 in the functions below
//       because ROCM uses different types for the various dim3 and maps them
//       directly to intrinsics, but they're dim3 when used after modification.
//
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename Func,
    typename _dim3,
    typename _dim3_2>
__device__ void blockReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);
  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);
  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          thread_idx, block_dim);
  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          thread_idx, block_dim);
  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;
  // Initialize shared memory; read_pred == false contributes the identity.
  if (read_pred) {
    shared_mem[smem_offset] = inp_val;
  } else {
    shared_mem[smem_offset] = init_val;
  }
  block_sync::sync();
  // Reduce down to nearest power of 2 for the tree reduction:
  // np2 is the largest power of two <= reduction_size; the first step folds
  // the overhang [np2, reduction_size) back into [0, np2).
  int np2 = 1 << (31 - __clz(reduction_size));
  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + np2]);
  }
  block_sync::sync();
  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      reduction_op(shared_mem[smem_offset], shared_mem[smem_offset + factor]);
    }
    block_sync::sync();
  }
  if (should_write && write_pred) {
    T result = out;
    reduction_op(result, shared_mem[smem_offset]);
    // The peeled final iteration: combine the remaining pair directly.
    if (reduction_size > 1) {
      reduction_op(result, shared_mem[smem_offset + 1]);
    }
    out = result;
  }
  block_sync::sync();
}
// Convenience overload of blockReduce that applies a single predicate to
// both the shared-memory read and the final write: all arguments are
// forwarded to the primary overload with read_write_pred supplied as both
// read_pred and write_pred.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename Func,
    typename _dim3,
    typename _dim3_2>
__device__ void blockReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem,
    bool read_write_pred,
    T init_val) {
  blockReduce<X_REDUCE, Y_REDUCE, Z_REDUCE, T, Func, _dim3, _dim3_2>(
      out,
      inp_val,
      reduction_op,
      thread_idx,
      block_dim,
      shared_mem,
      read_write_pred, // read predicate
      read_write_pred, // write predicate
      init_val);
}
// Inter-block reduction.
//
// The gridReduce function performs point-wise reductions of scalars across
// thread blocks. Thread blocks are disjointly partitioned into groups,
// "reduction segments", that are collectively defined by boolean template
// parameters, X_BLOCK, Y_BLOCK and Z_BLOCK. Each of X/Y/Z_BLOCK determines
// whether thread blocks along the dimension should be grouped into the same
// reduction segment. Cross-block reducitons are independently done within each
// segment and generates distinctive results per segment. For instance, if all
// of X/Y/Z_BLOCK are true, reductions will be done across all thread blocks
// since there will be just a single segment consisting of all thread blocks. If
// none of them are true, each thread block will become a segment by itself, so
// no reduction will be performed.
//
// The input scalars to reduce within each segment are a certain subset of
// thread-private scalars provided as part of the gridReduce function
// parameters. Boolean template parameters, X_THREAD, Y_THREAD and Z_THREAD,
// determine which subset of the scalars should be used for inter-block
// reductions. Specifically, all the input scalars of threads along each
// dimension will be used when X/Y/Z_THREAD are true. Otherwise, only the value
// held at offset 0 of each dimension will be used. Thus, for example, if all of
// X/Y/Z_THREAD are true, the scalars of all threads in each block will
// participate in inter-block reductions. If all of them are false, only one
// scalar of the thread at threadIdx.x == threadIdx.y == threadIdx.z == 0 will
// be used. In the code below, we call the subset of threads a "reduction
// block". "Participating" thread dimensions here are similar to the
// "non-participating" block dimensions. They come from a block dimension that
// has not been reduced before hitting this grid reduction.
//
// Inter-block reductions perform point-wise reductions of scalars of reduction
// blocks within each reduction segment. More specifically, let rb be a
// reduction block and rs be a reduction segment. Let IN(thread_idx, block_idx)
// denote the input scalar of thread at thread_idx and block_idx. The result of
// each reduction segment, OUT(thread_idx, block_idx_out), is defined only for
// each thread_idx in thread block block_idx_out in the segment as follows:
//
// OUT(thread_idx, block_idx_out) =
// Reduction of IN(thread_idx, block_idx) for
// all block_idx in a reduction segment
//
// OUT is not given for all threads that are not in block_idx_out and the
// reduction block.
//
// See also the function comment of gridReduce.
namespace reduction {
// Reduces all the reduction blocks in each reduction segment. This is the
// "cleanup" stage of a grid reduction.
//
// This is only called by one thread block per reduction segment. The input
// reduction blocks of the segment are stored in an intermediate buffer pointed
// by parameter in. Template parameters X/Y/Z_THREAD denote how the reduction
// block is formed.
//
// The size of a reduction block is by definition smaller or equal to the size
// of a thread block. We use the remaining threads to parallelize reductions
// across reduction blocks. For example, when X/Y/Z_THREAD = {true, false,
// false}, we use blockDim.y*blockDim.z threads for each output value. This is
// done first by loading the input values in parallel and then by reducing
// across threads of dimensions whose XYZ_THREAD are false.
//
// Note that what is done here after the loading from global memory is similar
// to what the existing blockReduce function does.
// Final combine step of a grid reduction: the single "last" block of a
// reduction segment folds every per-(block, participating-thread) entry of
// the global work buffer into `out`. See the namespace comment above for
// how the buffer and the reduction block are defined.
template <
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    typename T,
    typename Func>
__device__ void gridReduceLastBlock(
    T& out,
    const T* in,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    Func reduction_op,
    T* shared_buf,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.
  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, blockDim);
  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
  // Per-thread accumulator, seeded with the reduction identity.
  T inp = init_val;
  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    reduction_op(inp, in[work_buf_offset]);
  }
  // Block reduce the per thread values into per "participating" thread values
  T inp_tmp = init_val;
  blockReduce<!X_THREAD, !Y_THREAD, !Z_THREAD>(
      inp_tmp,
      inp,
      reduction_op,
      threadIdx,
      blockDim,
      shared_buf,
      true,
      init_val);
  // Only the thread at offset 0 of each non-participating dimension folds
  // the combined value into `out`, and only when the write predicate holds.
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    reduction_op(out, inp_tmp);
  }
}
// Reduces per-thread values across thread blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - reduction_op: Scalar reduction function
// - work_buf: Temporary buffer for cross-block reductions
// - sync_flags: A vector of integers for synchronizations
// - shared_buf: Shared memory buffer for intra-block reduction
//
// Thread has valid results based on if it's the last block in the grid
// reduction dimension
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, reduces across thread blocks along the X/Y/Z
// dimensions
// - X/Y/Z_THREAD: When true, all threads along the X/Y/Z dimensions participate
// in the cross-block reduction. Otherwise, only threads at offset 0 do.
// These are set to true if the dimension in the block has not been reduced
// previously in producer tensors, and does not participate in the reduction
// (right now they can't), so it's just a "pure" iteration domain as far as
// the grid reduce is concerned.
// - PERSISTENT_REDUCTION: Indicates grid reduction will be called in a loop, or
// the result of the grid reduction will be broadcasted and used across the
// grid. These require cross grid communication and the grid synchronizations
// here to actually synchronize across the entire grid. When false the grid is
// not synchronized, the last block just waits for everyone else to finish and
// the other blocks can exit early.
// - T: Scalar data type of input/output data
// - Func: Type of scalar reduction function
//
// Template parameters X/Y/Z_BLOCK define a group of thread blocks that are
// reduced together. We call it a reduction segment. Some examples are:
//
// Case 1: X/Y/Z_BLOCK == true/true/true -> There is only one segment, which
// includes all thread blocks. It is effectively the same as the grid.
//
// Case 2: X/Y/Z_BLOCK == false/false/false -> Each thread block comprises an
// individual segment by itself.
//
// Case 3: X/Y/Z_BLOCK == true/false/false -> Each segment contains thread
// blocks that have the same blockDim.x. There will be blockDim.y*blockDim.z
// such segments.
//
// X/Y/Z_THREAD defines a sub region of a thread block that should be reduced
// with the sub regions of other thread blocks. We call it a reduction block.
// E.g.,
//
// Case 1: X/Y/Z_THREAD == false/false/false -> Only thread 0 participates in
// the cross-block reductions. The reduction block is 1x1x1 with thread 0.
//
// Case 2: X/Y/Z_THREAD == true/true/true -> All threads in a thread block
// participate in the cross-block reductions. The reduction block in this case
// is equivalent to the thread block.
//
// After the function completes, only one thread block per reduction segment
// gets valid reduction results. There is no guarantee which particular block
// gets the final results.
//
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    typename T,
    typename Func>
__device__ void gridReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    volatile T* work_buf,
    Tensor<int64_t, 1> sync_flags,
    T* shared_buf,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);
  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  // One thread per reduction block publishes its partial value; threads
  // masked out by read_pred publish the reduction identity instead so the
  // cleanup pass can combine entries unconditionally.
  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, blockDim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    if (read_pred) {
      work_buf[work_buf_offset] = inp_val;
    } else {
      work_buf[work_buf_offset] = init_val;
    }
  }
  // Wait until every block in this segment has published its entry.
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
      sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
  if (last_block) {
    // Cleanup with block reduction
    gridReduceLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
        out,
        (T*)work_buf,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        reduction_op,
        shared_buf,
        write_pred,
        init_val);
  }
  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
        sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  }
}
} // namespace reduction
#undef isize
#undef ioffset
namespace grid_broadcast {
// Broadcasts per-thread values across threads and blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - work_buf: Temporary buffer for communication across threads/blocks
// - sync_flags: A vector of integers for synchronizations
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, broadcasts across thread blocks along the X/Y/Z
// dimensions
// - X/Y/Z_THREAD: When true, broadcasts across threads along the X/Y/Z
// dimensions
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    typename T>
__device__ void broadcast(
    T& out,
    const T& inp_val,
    volatile T* work_buf,
    Tensor<int64_t, 1> sync_flags,
    bool read_write_pred) {
  // Number of values broadcasted in the grid dimensions
  const auto grid_seg_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the broadcast we're performing out of the grid_seg_size
  const auto grid_seg_idx =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of threads not participating in a broadcast dimension, this is the
  // number of thread entries to expect in the work buffer, therefore a striding
  const auto block_stride =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
  // Which broadcast in the block this is to line up the entry with the work
  // buffer
  const auto thread_offset =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  // The source of the broadcast: the last block along each broadcast grid
  // dimension and the thread at offset 0 of each broadcast block dimension.
  const bool has_valid_data = (!X_BLOCK || blockIdx.x == gridDim.x - 1) &&
      (!Y_BLOCK || blockIdx.y == gridDim.y - 1) &&
      (!Z_BLOCK || blockIdx.z == gridDim.z - 1) &&
      (!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) &&
      (!Z_THREAD || threadIdx.z == 0);
  if (has_valid_data && read_write_pred) {
    work_buf[grid_seg_idx * block_stride + thread_offset] = inp_val;
    // Make the store visible grid-wide before signalling via sync_flags.
    __threadfence();
  }
  // Removed dead local `bool null = false;` — it was never read.
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
      sync_flags[grid_seg_idx], grid_seg_size);
  if (read_write_pred) {
    out = work_buf[grid_seg_idx * block_stride + thread_offset];
  }
  // Make sure everyone has read from the buffer before continuing the kernel
  // and potentially overwriting
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true>(
      sync_flags[grid_seg_idx], grid_seg_size);
}
} // namespace grid_broadcast
namespace broadcast {
// Broadcasts within partitioned groups of threads.
//
// X_THREAD: Broadcast from threadIdx.x == 0 if true
// Y_THREAD: Broadcast from threadIdx.y == 0 if true
// Z_THREAD: Broadcast from threadIdx.z == 0 if true
// inp_val: Per-thread source value. Only valid when the thread is a source.
// out: Per-thread output location
//
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T>
__device__ void blockBroadcast(
    T& out,
    const T& inp_val,
    T* shared_mem,
    bool read_write_pred) {
  // Every group of threads sharing a broadcast value maps to one shared-mem
  // slot, indexed by the non-broadcast dimensions.
  const auto smem_idx =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  // A thread is the source of its group when it sits at offset 0 of every
  // broadcast dimension.
  const bool is_source = (!X_THREAD || threadIdx.x == 0) &&
      (!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0);
  if (is_source && read_write_pred) {
    shared_mem[smem_idx] = inp_val;
  }
  block_sync::sync();
  if (read_write_pred) {
    out = shared_mem[smem_idx];
  }
  // Keep the buffer stable until every thread has read its value.
  block_sync::sync();
}
} // namespace broadcast
// -----------------------------------------------------------------------------------------------
// Block Welford Primitives
// -----------------------------------------------------------------------------------------------
// Merge the Welford partial (b_avg, b_M2, b_N) into the accumulator
// (a_avg, a_M2, a_N). Works both for scanning a single new value (b_N == 1)
// and for merging two partial results.
template <typename T, typename TN>
__inline__ __device__ void welfordCombine(
    T& a_avg,
    T& a_M2,
    TN& a_N,
    const T& b_avg,
    const T& b_M2,
    TN b_N) {
  // An empty partial contributes nothing.
  if (b_N == 0) {
    return;
  }
  const TN combined_N = a_N + b_N;
  const T b_frac =
      ((T)(nvfuser_index_t)(b_N)) / ((T)(nvfuser_index_t)(combined_N));
  const T mean_diff = b_avg - a_avg;
  a_avg += mean_diff * b_frac;
  // Note: uses the pre-update a_N; a_N itself is written last.
  a_M2 += b_M2 + mean_diff * mean_diff * ((T)(nvfuser_index_t)(a_N)) * b_frac;
  a_N = combined_N;
}
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block.
//
// Intra-block Welford combine: each thread contributes an (avg, M2, N)
// triple; threads at offset 0 of every reduced dimension receive the
// combined triple for their reduction segment.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename TN,
    typename _dim3,
    typename _dim3_2>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(thread_idx);
  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);
  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          thread_idx, block_dim);
  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          thread_idx, block_dim);
  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;
  if (read_pred) {
    shared_mem_avg[smem_offset] = in_avg;
    shared_mem_M2[smem_offset] = in_M2;
    shared_mem_N[smem_offset] = in_N;
  } else {
    // Masked-out threads contribute an empty partial (N == 0), which
    // welfordCombine ignores.
    shared_mem_avg[smem_offset] = init_val;
    shared_mem_M2[smem_offset] = init_val;
    shared_mem_N[smem_offset] = 0;
  }
  block_sync::sync();
  // Reduce down to nearest power of 2:
  int np2 = 1 << (31 - __clz(reduction_size));
  // First step folds the tail beyond np2 so the tree below is power-of-2.
  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    welfordCombine(
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset],
        shared_mem_avg[smem_offset + np2],
        shared_mem_M2[smem_offset + np2],
        shared_mem_N[smem_offset + np2]);
  }
  block_sync::sync();
  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      welfordCombine(
          shared_mem_avg[smem_offset],
          shared_mem_M2[smem_offset],
          shared_mem_N[smem_offset],
          shared_mem_avg[smem_offset + factor],
          shared_mem_M2[smem_offset + factor],
          shared_mem_N[smem_offset + factor]);
    }
    block_sync::sync();
  }
  // The peeled factor == 1 step: combine slots 0 and 1 into the caller's
  // accumulator directly.
  if (should_write && write_pred) {
    T res_avg = out_avg;
    T res_M2 = out_M2;
    TN res_N = out_N;
    welfordCombine(
        res_avg,
        res_M2,
        res_N,
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset]);
    if (reduction_size > 1) {
      welfordCombine(
          res_avg,
          res_M2,
          res_N,
          shared_mem_avg[smem_offset + 1],
          shared_mem_M2[smem_offset + 1],
          shared_mem_N[smem_offset + 1]);
    }
    out_avg = res_avg;
    out_M2 = res_M2;
    out_N = res_N;
  }
  block_sync::sync();
}
// Convenience overload of blockWelford that applies a single predicate to
// both the read of the input triple and the write of the result.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    typename T,
    typename TN,
    typename _dim3,
    typename _dim3_2>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    const _dim3& thread_idx,
    const _dim3_2& block_dim,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_write_pred,
    T init_val) {
  // Forward to the two-predicate variant; the value/count/dim3 template
  // arguments are deduced from the call.
  blockWelford<X_REDUCE, Y_REDUCE, Z_REDUCE>(
      out_avg,
      out_M2,
      out_N,
      in_avg,
      in_M2,
      in_N,
      thread_idx,
      block_dim,
      shared_mem_avg,
      shared_mem_M2,
      shared_mem_N,
      /*read_pred=*/read_write_pred,
      /*write_pred=*/read_write_pred,
      init_val);
}
// -----------------------------------------------------------------------------------------------
// Grid Welford Prototype
// -----------------------------------------------------------------------------------------------
namespace welford {
// Final combine step of a grid Welford: the "last" block of a reduction
// segment folds every (avg, M2, N) entry of the global work buffers into
// the caller's accumulator. Mirrors gridReduceLastBlock.
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T, typename TN>
__device__ void gridWelfordLastBlock(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T* in_avg,
    const T* in_M2,
    const TN* in_N,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.
  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, blockDim);
  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, blockDim);
  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(blockDim);
  // Per-thread Welford accumulator (empty: N == 0).
  T inp_avg = init_val;
  T inp_M2 = init_val;
  TN inp_N = 0;
  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    welfordCombine(
        inp_avg,
        inp_M2,
        inp_N,
        in_avg[work_buf_offset],
        in_M2[work_buf_offset],
        in_N[work_buf_offset]);
  }
  // Block reduce the per thread values into per "participating" thread values
  T inp_avg_tmp = init_val;
  T inp_M2_tmp = init_val;
  TN inp_N_tmp = 0;
  blockWelford<!X_THREAD, !Y_THREAD, !Z_THREAD>(
      inp_avg_tmp,
      inp_M2_tmp,
      inp_N_tmp,
      inp_avg,
      inp_M2,
      inp_N,
      threadIdx,
      blockDim,
      shared_buf_avg,
      shared_buf_M2,
      shared_buf_N,
      true,
      init_val);
  // Only the thread at offset 0 of each non-participating dimension folds
  // the combined triple into the output, and only when write_pred holds.
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    welfordCombine(out_avg, out_M2, out_N, inp_avg_tmp, inp_M2_tmp, inp_N_tmp);
  }
}
// Grid welford combine: cross-block Welford reduction, structured like
// gridReduce (publish per-block partials, grid sync, last block combines).
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    typename T,
    typename TN>
__device__ void gridWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& inp_avg,
    const T& inp_M2,
    const TN& inp_N,
    volatile T* work_buf_avg,
    volatile T* work_buf_M2,
    volatile TN* work_buf_N,
    Tensor<int64_t, 1> sync_flags,
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool read_pred,
    bool write_pred,
    T init_val) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(blockDim);
  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf_avg += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  work_buf_M2 += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  work_buf_N += idx_in_grid_segment * grid_reduction_segment_size *
      block_reduction_segment_size;
  // One thread per reduction block publishes its partial triple; threads
  // masked out by read_pred publish an empty partial (N == 0).
  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, blockDim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    if (read_pred) {
      work_buf_avg[work_buf_offset] = inp_avg;
      work_buf_M2[work_buf_offset] = inp_M2;
      work_buf_N[work_buf_offset] = inp_N;
    } else {
      work_buf_avg[work_buf_offset] = init_val;
      work_buf_M2[work_buf_offset] = init_val;
      work_buf_N[work_buf_offset] = 0;
    }
  }
  // Wait until every block in this segment has published its entries.
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
      sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
  if (last_block) {
    // final reduction
    gridWelfordLastBlock<X_THREAD, Y_THREAD, Z_THREAD>(
        out_avg,
        out_M2,
        out_N,
        (T*)work_buf_avg,
        (T*)work_buf_M2,
        (TN*)work_buf_N,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        shared_buf_avg,
        shared_buf_M2,
        shared_buf_N,
        write_pred,
        init_val);
  }
  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION>(
        sync_flags[idx_in_grid_segment], grid_reduction_segment_size);
  }
}
} // namespace welford
#undef isize
#undef ioffset
namespace warp {
// Reduction across the TIDX (threadIdx.x) dimension using warp shuffles,
// with a shared-memory pass across warps when the block has more than one
// warp along x. SINGLE_WARP == true skips the cross-warp stage entirely.
template <
    bool SINGLE_WARP,
    typename T,
    typename Func,
    typename _dim3ti,
    typename _dim3bd>
__device__ void warpReduceTIDX(
    T& out,
    const T& inp_val,
    Func reduction_op,
    const _dim3ti& thread_idx,
    const _dim3bd& block_dim,
    T* shared_mem,
    bool read_write_pred,
    T init_val) {
  constexpr int WARP_SIZE = 32;
  // Assume input padded to multiples of a warp
  T reduce_val = init_val;
  // Do warp reduction; predicated-out threads contribute the identity.
  if (read_write_pred) {
    reduce_val = inp_val;
  }
  // Reduce within each warp: xor-shuffle with offsets 16, 8, 4, 2, 1 so all
  // 32 lanes end up holding the warp's combined value.
  for (int i = 16; i >= 1; i /= 2) {
    reduction_op(
        reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, WARP_SIZE));
  }
  // Reduce across warp if needed
  // Load value to shared mem
  if (!SINGLE_WARP) {
    unsigned int warp_idx = thread_idx.x / WARP_SIZE;
    unsigned int lane_idx = thread_idx.x % WARP_SIZE;
    // One independent reduction per (z, y) coordinate.
    unsigned int reduce_group_id = thread_idx.z * block_dim.y + thread_idx.y;
    bool is_warp_head = lane_idx == 0;
    unsigned int reduction_size = block_dim.x;
    unsigned int num_of_warps = reduction_size / WARP_SIZE;
    unsigned int smem_offset = reduce_group_id * num_of_warps;
    block_sync::sync();
    // Each warp's lane 0 deposits the warp result into shared memory.
    if (read_write_pred && is_warp_head) {
      shared_mem[smem_offset + warp_idx] = reduce_val;
    }
    block_sync::sync();
    if (warp_idx == 0) {
      // This assumes num_of_warps will be < 32, meaning < 1024 blocks.
      // Should be true for long enough.
      assert(num_of_warps <= 32);
      reduce_val = lane_idx < num_of_warps ? shared_mem[smem_offset + lane_idx]
                                           : init_val;
      // Reduce within warp 0
      for (int i = 16; i >= 1; i /= 2) {
        reduction_op(
            reduce_val, __shfl_xor_sync(0xffffffff, reduce_val, i, 32));
      }
    }
    // Warp 0's lane 0 holds the cross-warp result; fold it into `out`.
    if (is_warp_head) {
      reduction_op(out, reduce_val);
    }
  } else {
    reduction_op(out, reduce_val);
  }
}
} // namespace warp
// No "#pragma once" because this is a raw definition that can be copied by jit codegen.
// Eager mode clients should not include this file directly, instead,
// they should #include <ATen/cuda/CUDAGeneratorImpl.h>, which has a #pragma once.
// Stores RNG state values. Passed as a kernel argument.
// See Note [CUDA Graph-safe RNG states].
//
// The raw definition lives in its own file so jit codegen can easily copy it.
namespace at {
struct PhiloxCudaState {
  PhiloxCudaState() = default;
  // Constructor used when no CUDA graph capture is underway: the offset is
  // stored as a plain value.
  PhiloxCudaState(uint64_t seed, uint64_t offset) : seed_(seed) {
    offset_.val = offset;
  }
  // Constructor used while graph capture is underway: the offset lives
  // behind a pointer filled in outside the graph, plus an intra-graph
  // adjustment.
  PhiloxCudaState(
      uint64_t seed,
      int64_t* offset_extragraph,
      uint32_t offset_intragraph)
      : seed_(seed), offset_intragraph_(offset_intragraph), captured_(true) {
    offset_.ptr = offset_extragraph;
  }
  // Members stay public so at::cuda::philox::unpack can access them
  // directly; getters/setters would have to be __device__, and we can't
  // declare __device__ in ATen.
  union Payload {
    uint64_t val;
    int64_t* ptr;
  };
  uint64_t seed_ = 0;
  Payload offset_;
  uint32_t offset_intragraph_ = 0;
  bool captured_ = false;
};
} // namespace at
// Generated fusion kernel: elementwise select, T3 = where(T0, T1, T2).
// NOTE(review): per the NVRTC error reproduced below, this call fails to
// compile for Tensor<int, 0> operands — the runtime defines where() only
// for double/float/int64_t value types, so (__nv_bool, int, int) is
// ambiguous.
__global__ void kernel212(Tensor<bool, 0> T0, Tensor<int, 0> T1, Tensor<int, 0> T2, Tensor<int, 0> T3) {
  T3[0]
      = where(T0[0], T1[0], T2[0]);
}
}
CUDA NVRTC compile error: default_program(1670): error: more than one instance of overloaded function "CudaCodeGen::where" matches the argument list:
function "CudaCodeGen::where(__nv_bool, double, double)"
function "CudaCodeGen::where(__nv_bool, float, float)"
function "CudaCodeGen::where(__nv_bool, CudaCodeGen::int64_t, CudaCodeGen::int64_t)"
argument types are: (__nv_bool, int, int)
1 error detected in the compilation of "default_program".
======================================================================
ERROR: test_nvfuser_correctness__masked_softmax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":15, please report a bug to PyTorch.
======================================================================
ERROR: test_nvfuser_correctness__masked_softmax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":15, please report a bug to PyTorch.
======================================================================
ERROR: test_nvfuser_correctness__masked_softmax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":15, please report a bug to PyTorch.
======================================================================
ERROR: test_nvfuser_correctness__masked_softmax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":15, please report a bug to PyTorch.
======================================================================
ERROR: test_nvfuser_correctness__masked_softmin_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":15, please report a bug to PyTorch.
======================================================================
ERROR: test_nvfuser_correctness__masked_softmin_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":15, please report a bug to PyTorch.
======================================================================
ERROR: test_nvfuser_correctness__masked_softmin_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":15, please report a bug to PyTorch.
======================================================================
ERROR: test_nvfuser_correctness__masked_softmin_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: kReductionAxis >= 0 && kReductionAxis < kNumberOfDimsINTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/ops/normalization.cpp":15, please report a bug to PyTorch.
======================================================================
ERROR: test_nvfuser_correctness__masked_sum_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_sum_cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: CUDA driver error: invalid resource handle
======================================================================
ERROR: test_nvfuser_correctness__masked_sum_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_sum_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_sum_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_sum_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: CUDA driver error: invalid resource handle
======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_complex128 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Expected kernel_ to be true, but got false. (Could this error message be improved? If so, please report an enhancement request to PyTorch.)
======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_complex64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Expected kernel_ to be true, but got false. (Could this error message be improved? If so, please report an enhancement request to PyTorch.)
======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_int16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_int8 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: graph_cache_.count(kernel_id) > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/manager.cpp":108, please report a bug to PyTorch. graph cache miss at run time
======================================================================
ERROR: test_nvfuser_correctness__masked_var_cuda_uint8 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: graph_cache_.count(kernel_id) > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/manager.cpp":108, please report a bug to PyTorch. graph cache miss at run time
======================================================================
ERROR: test_nvfuser_correctness_acos_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_acos_cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_acos_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_acos_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_acos_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_acos_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_acos_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_add_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_add_cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_add_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_add_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_add_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_add_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_add_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_addcmul_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from DataType: int64_t to DataType: int
======================================================================
ERROR: test_nvfuser_correctness_amax_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness_amax_cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Could not generate a max op for tensor with type: bool
======================================================================
ERROR: test_nvfuser_correctness_amax_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness_amax_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness_amax_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness_amax_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Could not generate a max op for tensor with type: int
======================================================================
ERROR: test_nvfuser_correctness_amax_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Tried to reduce a 0-dim tensor
======================================================================
ERROR: test_nvfuser_correctness_bitwise_not_cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_bitwise_not_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_bitwise_not_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from DataType: float to DataType: bool
======================================================================
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: sorted_exprs.size() > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp":1403, please report a bug to PyTorch. Error during expression sorting, no expressions produced.
======================================================================
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from DataType: float to DataType: bool
======================================================================
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from DataType: float to DataType: bool
======================================================================
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from DataType: double to DataType: bool
======================================================================
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from DataType: int to DataType: bool
======================================================================
ERROR: test_nvfuser_correctness_bool_channels_last_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from DataType: int64_t to DataType: bool
======================================================================
ERROR: test_nvfuser_correctness_bool_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from DataType: float to DataType: bool
======================================================================
ERROR: test_nvfuser_correctness_bool_cuda_bool (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: sorted_exprs.size() > 0INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp":1403, please report a bug to PyTorch. Error during expression sorting, no expressions produced.
======================================================================
ERROR: test_nvfuser_correctness_bool_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from DataType: float to DataType: bool
======================================================================
ERROR: test_nvfuser_correctness_bool_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from DataType: float to DataType: bool
======================================================================
ERROR: test_nvfuser_correctness_bool_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from DataType: double to DataType: bool
======================================================================
ERROR: test_nvfuser_correctness_bool_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from DataType: int to DataType: bool
======================================================================
ERROR: test_nvfuser_correctness_bool_cuda_int64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from DataType: int64_t to DataType: bool
======================================================================
ERROR: test_nvfuser_correctness_clamp_scalar_cuda_bfloat16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_clamp_scalar_cuda_float16 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_clamp_scalar_cuda_float32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_clamp_scalar_cuda_float64 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: Tried to vectorize a dim resulting in a word size of 0 however, vector sizes only upto and including 16 bytes are supported.
======================================================================
ERROR: test_nvfuser_correctness_clamp_scalar_cuda_int32 (__main__.TestCudaFuserOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 81, in _script_method_graph_for
dbs = parent.get_debug_state()
RuntimeError: optimized_plan_INTERNAL ASSERT FAILED at "../torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp":632, please report a bug to PyTorch.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 1753, in wrapper
method(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 381, in instantiated_test
raise rte
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 376, in instantiated_test
result = test(self, **param_kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_device_type.py", line 753, in test_wrapper
return test(*args, **kwargs)
File "/fsx/users/dberard/pytorch/test/test_jit_cuda_fuser.py", line 3232, in test_nvfuser_correctness
trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/jit_metaprogramming_utils.py", line 369, in traced_fn
traced_fn.last_graph = traced.graph_for(*inputs_tensors) # type: ignore[attr-defined]
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 77, in _graph_for
return _script_method_graph_for(self, self, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/jit/_fuser.py", line 106, in _script_method_graph_for
self(*args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 427, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "/fsx/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 424, in prof_callable
return callable(*args, **kwargs)
RuntimeError: Illegal Cast value from DataType: int64_t to DataType: int
====
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment