Compare numerics: batch_norm_backward vs native_batch_norm_backward (#116092)
# Debug test failure in https://github.com/pytorch/pytorch/pull/116092 for:
# python test/test_decomp.py -k test_comprehensive_batch_norm_with_update_cuda_bfloat16
# Set up args (these are the exact tensors saved from the decomp test).
# Most floating-point tensors in args16 are bfloat16; the 6th and 7th
# (save_mean/save_invstd, printed without a dtype) are float32, and the
# empty reserve tensor is uint8.
# All floating-point tensors in args64 are the same values upcast to float64.
# (torch.set_printoptions(precision=10) appears to be in effect.)
>>> args16
[tensor([[-0.5468750000],
         [ 0.7812500000]], device='cuda:0', dtype=torch.bfloat16),
 tensor([[-1.5234375000],
         [-4.1875000000]], device='cuda:0', dtype=torch.bfloat16, requires_grad=True),
 tensor([8.8125000000], device='cuda:0', dtype=torch.bfloat16, requires_grad=True),
 tensor([-2.4687500000], device='cuda:0', dtype=torch.bfloat16),
 tensor([3.7968750000], device='cuda:0', dtype=torch.bfloat16),
 tensor([-2.8554687500], device='cuda:0'),
 tensor([0.7507309914], device='cuda:0'),
 True,
 1e-05,
 [True, True, True],
 tensor([], device='cuda:0', dtype=torch.uint8),
 False]
>>> args64
[tensor([[-0.5468750000],
         [ 0.7812500000]], device='cuda:0', dtype=torch.float64),
 tensor([[-1.5234375000],
         [-4.1875000000]], device='cuda:0', dtype=torch.float64),
 tensor([8.8125000000], device='cuda:0', dtype=torch.float64),
 tensor([-2.4687500000], device='cuda:0', dtype=torch.float64),
 tensor([3.7968750000], device='cuda:0', dtype=torch.float64),
 tensor([-2.8554687500], device='cuda:0', dtype=torch.float64),
 tensor([0.7507309914], device='cuda:0', dtype=torch.float64),
 True,
 1e-05,
 [True, True, True],
 tensor([], device='cuda:0', dtype=torch.uint8),
 False]
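# Illustrative sketch (not from the original session): args64 can be rebuilt from
# args16 by upcasting every floating-point tensor to float64; non-tensor args and
# the integer reserve tensor pass through unchanged.
import torch

def upcast_args(args):
    # Upcast only floating-point tensors; keep bools, floats, lists, and
    # integer tensors (like the empty uint8 reserve) as-is.
    return [a.to(torch.float64)
            if isinstance(a, torch.Tensor) and a.is_floating_point()
            else a
            for a in args]

# args64 = upcast_args(args16)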
# Run the decomp test by hand for batch_norm_backward:
#   x16 = eager op in bfloat16, x64 = eager op in float64 (the reference),
#   d16 = the Python decomposition in bfloat16.
# The differences below are the exact same numbers as the ones in the failed test.
>>> x16 = torch.ops.aten.batch_norm_backward.default(*args16)
>>> x64 = torch.ops.aten.batch_norm_backward.default(*args64)
>>> d16 = torch._decomp.decompositions.batch_norm_backward(*args16)
>>> x16[0] - x64[0]
tensor([[-3.0807249121e-08],
[ 3.0807249121e-08]], device='cuda:0', dtype=torch.float64,
grad_fn=<SubBackward0>)
>>> d16[0] - x64[0]
tensor([[-1.5001653867e-07],
[ 1.5001653867e-07]], device='cuda:0', dtype=torch.float64,
grad_fn=<SubBackward0>)
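# From the values above, the decomposition's error against the float64 reference
# (1.5001653867e-07) is about 4.9x the eager kernel's error (3.0807249121e-08);
# a direct way to compute that ratio (not run in the original session):
ratio = (d16[0] - x64[0]).abs().max() / (x16[0] - x64[0]).abs().max()  # ~4.87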
# Run the same decomp test for native_batch_norm_backward, which takes the same
# args minus the last two (the empty uint8 reserve tensor and the trailing bool).
# We get the exact same differences as with batch_norm_backward.
>>> n16 = torch.ops.aten.native_batch_norm_backward.default(*args16[:-2])
>>> n64 = torch.ops.aten.native_batch_norm_backward.default(*args64[:-2])
>>> nd16 = torch._decomp.decompositions.native_batch_norm_backward(*args16[:-2])
>>> n16[0] - n64[0]
tensor([[-3.0807249121e-08],
[ 3.0807249121e-08]], device='cuda:0', dtype=torch.float64,
grad_fn=<SubBackward0>)
>>> nd16[0] - n64[0]
tensor([[-1.5001653867e-07],
[ 1.5001653867e-07]], device='cuda:0', dtype=torch.float64,
grad_fn=<SubBackward0>)
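# Since both decompositions show identical diffs against the same float64
# reference, their outputs should match each other bit-for-bit as well; a direct
# check (not run in the original session):
same = torch.equal(d16[0], nd16[0])  # expected True given the diffs above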
# Sanity check: compare `batch_norm_backward` and `native_batch_norm_backward`
# outputs directly. They are the same.
>>> x16[0] == n16[0]
tensor([[True],
[True]], device='cuda:0')
>>> x64[0] == n64[0]
tensor([[True],
[True]], device='cuda:0')
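# The decomp test effectively asserts that the decomposition's error against the
# float64 reference stays within some multiple of the eager kernel's own error.
# A hand-rolled version of that check (illustrative only; the real tolerance
# logic lives in test/test_decomp.py, and the 2x factor is an assumed stand-in):
op_err     = (x16[0] - x64[0]).abs().max()
decomp_err = (d16[0] - x64[0]).abs().max()
assert decomp_err <= 2 * op_err, f"decomp error {decomp_err} vs op error {op_err}"
# With the numbers above (decomp_err is ~4.9x op_err), this assertion fails,
# consistent with the observed test failure.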