
@cchan
Created December 28, 2022 03:52
A janky benchmark script for autocasted grouped convolution in PyTorch
import time
import torch

# Disable TF32 so the enabled=False case measures a true fp32 baseline.
# (cudnn.allow_tf32 is the flag that governs convolutions; the matmul flag
# alone does not affect conv2d.)
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False

GROUPS = 64
for k in [3, 5, 7]:
    x = torch.rand(16, 64, 1024, 1024, device="cuda", dtype=torch.float32)
    weight = torch.rand(64, 64 // GROUPS, k, k, device="cuda", dtype=torch.float32)
    for ctx in [
        torch.autocast("cuda", enabled=False),
        torch.autocast("cuda", dtype=torch.float16),
        torch.autocast("cuda", dtype=torch.bfloat16),
    ]:
        with ctx:
            # Warmup so cuDNN kernel selection isn't counted in the timing.
            for _ in range(10):
                torch.nn.functional.conv2d(x, weight, groups=GROUPS)
            torch.cuda.synchronize()

            # Time 100 convolutions and report the mean seconds per call.
            start = time.time()
            for _ in range(100):
                torch.nn.functional.conv2d(x, weight, groups=GROUPS)
            torch.cuda.synchronize()
            print((time.time() - start) / 100)
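Wall-clock timing with time.time() plus torch.cuda.synchronize() works, but CUDA events are a common alternative for per-call numbers. The sketch below is a hedged variant under the same shapes and GROUPS as the script above; the time_conv helper is hypothetical and not part of the original gist.

import torch

# Hypothetical helper (not in the original gist): time a grouped conv2d with
# CUDA events instead of time.time(). Returns mean milliseconds per call.
def time_conv(x, weight, groups, warmup=10, iters=100):
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    for _ in range(warmup):
        torch.nn.functional.conv2d(x, weight, groups=groups)
    torch.cuda.synchronize()
    start.record()
    for _ in range(iters):
        torch.nn.functional.conv2d(x, weight, groups=groups)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters  # milliseconds per call

# Example usage mirroring one configuration from the benchmark above.
GROUPS = 64
x = torch.rand(16, 64, 1024, 1024, device="cuda")
weight = torch.rand(64, 64 // GROUPS, 3, 3, device="cuda")
with torch.autocast("cuda", dtype=torch.float16):
    print(time_conv(x, weight, GROUPS))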