@HDCharles
Created January 25, 2024 03:07
compare bitsandbytes with torchao
######################################################################
# Comparing torchao and bitsandbytes
######################################################################
# Set up Your Environment
# --------------------------------
#
# First, let's configure your environment. This guide requires CUDA 12.1.
# We ran this tutorial on an A100-PG509-200 power limited to 330.00 W. If you
# are using different hardware, you may see different performance numbers.
#
#
# .. code-block:: bash
#
# > conda create -n myenv python=3.10
# > conda activate myenv
# > pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
# > pip install git+https://github.com/pytorch-labs/ao.git
#
#
import bitsandbytes
import torch
from torch.utils.benchmark import Timer
from torchao.quantization import (
    change_linear_weights_to_int4_woqtensors,
    change_linear_weights_to_int8_dqtensors,
    change_linear_weights_to_int8_woqtensors,
)
torch._inductor.config.use_mixed_mm = True
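# Optional sanity check (an addition, not in the original gist): the numbers
# below assume an A100 with a CUDA 12.1 build of PyTorch, so it's worth
# confirming the environment before benchmarking.
assert torch.cuda.is_available(), "this benchmark requires a CUDA GPU"
print(f"torch {torch.__version__}, CUDA {torch.version.cuda}, "
      f"device: {torch.cuda.get_device_name(0)}")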
@torch.no_grad()
def benchmark(f, *args, **kwargs):
    # warmup so compilation/autotuning cost doesn't pollute the measurement
    for _ in range(3):
        f(*args, **kwargs)
    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats()
    t0 = Timer(
        stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
    )
    res = t0.blocked_autorange()
    # median latency in milliseconds and peak CUDA memory in GB
    return {'time': res.median * 1e3, 'memory': torch.cuda.max_memory_allocated() / 1e9}
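# A quick usage sketch (an addition, not in the original gist): benchmark()
# times any callable; a plain bf16 matmul shows the output format that the
# comparisons below rely on.
_a = torch.randn(4096, 4096, device='cuda', dtype=torch.bfloat16)
_b = torch.randn(4096, 4096, device='cuda', dtype=torch.bfloat16)
print("sanity matmul:", benchmark(torch.mm, _a, _b))  # {'time': <ms>, 'memory': <GB>}
del _a, _b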
# each shape is (i, j, k, model): an (i, j) input activation fed through a
# Linear(j, k), with shapes drawn from SAM and LLaMA workloads
shapes = [
    (78400, 1280, 3840, "SAM"),
    (78400, 1280, 1280, "SAM"),
    (65536, 1280, 5120, "SAM"),
    (65536, 5120, 1280, "SAM"),
    (65536, 1280, 3840, "SAM"),
    (65536, 1280, 1280, "SAM"),
    (1, 4096, 4096, "LLAMA"),
    (1, 4096, 11008, "LLAMA"),
    (1, 11008, 4096, "LLAMA"),
    (1, 4096, 12288, "LLAMA"),
    (1, 4096, 32000, "LLAMA"),
]
for i, j, k, m in shapes:
    bias = (m == "SAM")  # the SAM linears use bias, the LLaMA ones here do not
    res = {}
    image = torch.randn(i, j, device='cuda', dtype=torch.bfloat16)
    if m == "SAM":
        # torchao int8 dynamic quantization (int8 activations x int8 weights), compiled
        lin = torch.nn.Linear(j, k, bias=bias).to(torch.bfloat16).cuda()
        change_linear_weights_to_int8_dqtensors(lin)
        lin_c = torch.compile(lin, mode='max-autotune')
        res["ao-int8dq-c"] = benchmark(lin_c, image)
        del lin, lin_c
        # bitsandbytes int8 (weights quantized when the module moves to CUDA); expects fp16 inputs
        image = image.to(torch.float16)
        lin = bitsandbytes.nn.Linear8bitLt(j, k, bias=bias, has_fp16_weights=False).cuda()
        res["bb-int8"] = benchmark(lin, image)
        del lin
    if m == "LLAMA":
        # torchao int4 weight-only quantization; runs its own kernel, no compile needed
        lin = torch.nn.Linear(j, k, bias=bias).to(torch.bfloat16).cuda()
        change_linear_weights_to_int4_woqtensors(lin, groupsize=64)
        res["ao-int4wo"] = benchmark(lin, image)
        del lin
        # torchao int8 weight-only quantization, compiled
        lin = torch.nn.Linear(j, k, bias=bias).to(torch.bfloat16).cuda()
        change_linear_weights_to_int8_woqtensors(lin)
        lin_c = torch.compile(lin, mode='max-autotune')
        res["ao-int8wo-c"] = benchmark(lin_c, image)
        del lin, lin_c
        # bitsandbytes 4-bit; expects fp16 inputs
        image = image.to(torch.float16)
        lin = bitsandbytes.nn.Linear4bit(j, k, bias=bias, device='cuda').cuda()
        res["bb-int4"] = benchmark(lin, image)
        del lin
    # bf16 baseline, eager and compiled
    image = image.to(torch.bfloat16)
    lin = torch.nn.Linear(j, k, bias=bias).to(torch.bfloat16).cuda()
    res["bf16"] = benchmark(lin, image)
    lin_c = torch.compile(lin, mode='max-autotune')
    res["bf16-c"] = benchmark(lin_c, image)
    del lin, lin_c
    perf = "perf:"
    mem = "mem: "
    for key, res_data in res.items():
        perf += f"{key}({res_data['time']:0.2f}ms) "
        mem += f"{key}({res_data['memory']:0.2f}GB) "
    print(f"for shape {i,j,k}, bias={bias}, from model {m}\n{perf}\n{mem}")
Raw Output

for shape (78400, 1280, 3840), bias=True, from model SAM
perf:ao-int8dq-c(3.78ms) bb-int8(6.17ms) bf16(3.55ms) bf16-c(3.77ms)
mem: ao-int8dq-c(1.07GB) bb-int8(2.32GB) bf16(0.83GB) bf16-c(1.08GB)
for shape (78400, 1280, 1280), bias=True, from model SAM
perf:ao-int8dq-c(1.66ms) bb-int8(2.98ms) bf16(1.22ms) bf16-c(1.43ms)
mem: ao-int8dq-c(1.02GB) bb-int8(1.12GB) bf16(0.43GB) bf16-c(0.83GB)
for shape (65536, 1280, 5120), bias=True, from model SAM
perf:ao-int8dq-c(6.45ms) bb-int8(6.36ms) bf16(3.98ms) bf16-c(4.17ms)
mem: ao-int8dq-c(1.47GB) bb-int8(2.47GB) bf16(0.89GB) bf16-c(1.48GB)
for shape (65536, 5120, 1280), bias=True, from model SAM
perf:ao-int8dq-c(4.00ms) bb-int8(7.16ms) bf16(3.66ms) bf16-c(4.41ms)
mem: ao-int8dq-c(1.72GB) bb-int8(2.23GB) bf16(0.91GB) bf16-c(1.58GB)
for shape (65536, 1280, 3840), bias=True, from model SAM
perf:ao-int8dq-c(3.45ms) bb-int8(5.20ms) bf16(2.97ms) bf16-c(3.12ms)
mem: ao-int8dq-c(1.41GB) bb-int8(2.00GB) bf16(0.75GB) bf16-c(1.42GB)
for shape (65536, 1280, 1280), bias=True, from model SAM
perf:ao-int8dq-c(1.50ms) bb-int8(2.56ms) bf16(1.02ms) bf16-c(1.24ms)
mem: ao-int8dq-c(1.08GB) bb-int8(1.00GB) bf16(0.42GB) bf16-c(1.09GB)
for shape (1, 4096, 4096), bias=False, from model LLAMA
perf:ao-int4wo(0.07ms) ao-int8wo-c(0.12ms) bb-int4(0.11ms) bf16(0.02ms) bf16-c(0.10ms)
mem: ao-int4wo(0.09GB) ao-int8wo-c(0.77GB) bb-int4(0.11GB) bf16(0.14GB) bf16-c(0.80GB)
for shape (1, 4096, 11008), bias=False, from model LLAMA
perf:ao-int4wo(0.07ms) ao-int8wo-c(0.13ms) bb-int4(0.10ms) bf16(0.06ms) bf16-c(0.10ms)
mem: ao-int4wo(0.15GB) ao-int8wo-c(0.85GB) bb-int4(0.20GB) bf16(0.27GB) bf16-c(0.94GB)
for shape (1, 11008, 4096), bias=False, from model LLAMA
perf:ao-int4wo(0.08ms) ao-int8wo-c(0.15ms) bb-int4(0.10ms) bf16(0.06ms) bf16-c(0.10ms)
mem: ao-int4wo(0.29GB) ao-int8wo-c(0.98GB) bb-int4(0.34GB) bf16(0.41GB) bf16-c(1.07GB)
for shape (1, 4096, 12288), bias=False, from model LLAMA
perf:ao-int4wo(0.07ms) ao-int8wo-c(0.12ms) bb-int4(0.10ms) bf16(0.07ms) bf16-c(0.11ms)
mem: ao-int4wo(0.43GB) ao-int8wo-c(1.12GB) bb-int4(0.48GB) bf16(0.56GB) bf16-c(1.22GB)
for shape (1, 4096, 32000), bias=False, from model LLAMA
perf:ao-int4wo(0.07ms) ao-int8wo-c(0.22ms) bb-int4(0.13ms) bf16(0.17ms) bf16-c(0.17ms)
mem: ao-int4wo(0.62GB) ao-int8wo-c(1.35GB) bb-int4(0.76GB) bf16(0.95GB) bf16-c(1.62GB)
