Laurawly/bench_cublas.py

## bench_cublas.py
import os

import numpy as np
import tvm
from tvm import te, auto_scheduler, topi
from tvm.topi.testing import conv2d_nchw_python
from tvm.contrib import cublas
# Benchmark a cuBLAS-backed matmul through TVM: build the operator, validate
# its output against a NumPy reference, then time it on the GPU.
target = tvm.target.Target('cuda')

# Problem size: the output is (M, N), reduced over K.
M = 8192
N = 2304
K = 768
A = te.placeholder((M, K), name='data', dtype='float16')
B = te.placeholder((N, K), name='kernel', dtype='float16')
# B is stored as (N, K); the transb=True flag makes this A @ B^T with a
# float32 result.
C = cublas.matmul(A, B, False, True, dtype='float32')

sch = te.create_schedule(C.op)
args = [A, B, C]
func = tvm.build(sch, args, target)

# Check correctness
data_np = np.random.uniform(size=(M, K)).astype(np.float16)
weight_np = np.random.uniform(size=(N, K)).astype(np.float16)
# Upcast the float16 inputs before the reference matmul so that the reference
# has the same dtype as the float32 cuBLAS output instead of being rounded to
# float16. The float32 matmul is also typically far faster in NumPy than the
# float16 one at this problem size.
out_np = np.matmul(data_np.astype(np.float32), weight_np.astype(np.float32).T)

# NOTE(review): newer TVM releases spell these tvm.cuda() / device=...;
# confirm against the installed TVM version before changing.
ctx = tvm.gpu()
data_tvm = tvm.nd.array(data_np, ctx=ctx)
weight_tvm = tvm.nd.array(weight_np, ctx=ctx)
out_tvm = tvm.nd.array(np.zeros((M, N), dtype=C.dtype), ctx=ctx)
func(data_tvm, weight_tvm, out_tvm)

# Check results
np.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3)

# Evaluate execution time: take the median over the repeated measurements so
# a single slow repeat does not skew the reported number.
evaluator = func.time_evaluator(func.entry_name, ctx, number=100, repeat=10)
time = np.median(evaluator(data_tvm, weight_tvm, out_tvm).results)
print("shape", data_np.shape, weight_np.shape)
print(f"Execution time of this operator: {time * 1000:.3f} ms")
# A matmul performs one multiply and one add per (m, n, k) triple: 2*M*N*K FLOPs.
print(f"Speed: {2 * (M * N * K) / time / 1e12:.3f} TFLOPS")
	import os

	import numpy as np
	import tvm
	from tvm import te, auto_scheduler, topi
	from tvm.topi.testing import conv2d_nchw_python
	from tvm.contrib import cublas
	# Benchmark a cuBLAS-backed matmul through TVM: build the operator, validate
	# its output against a NumPy reference, then time it on the GPU.
	target = tvm.target.Target('cuda')

	# Problem size: the output is (M, N), reduced over K.
	M = 8192
	N = 2304
	K = 768
	A = te.placeholder((M, K), name='data', dtype='float16')
	B = te.placeholder((N, K), name='kernel', dtype='float16')
	# B is stored as (N, K); the transb=True flag makes this A @ B^T with a
	# float32 result.
	C = cublas.matmul(A, B, False, True, dtype='float32')

	sch = te.create_schedule(C.op)
	args = [A, B, C]
	func = tvm.build(sch, args, target)

	# Check correctness
	data_np = np.random.uniform(size=(M, K)).astype(np.float16)
	weight_np = np.random.uniform(size=(N, K)).astype(np.float16)
	# Upcast the float16 inputs before the reference matmul so that the
	# reference has the same dtype as the float32 cuBLAS output instead of
	# being rounded to float16.
	out_np = np.matmul(data_np.astype(np.float32), weight_np.astype(np.float32).T)

	ctx = tvm.gpu()
	data_tvm = tvm.nd.array(data_np, ctx=ctx)
	weight_tvm = tvm.nd.array(weight_np, ctx=ctx)
	out_tvm = tvm.nd.array(np.zeros((M, N), dtype=C.dtype), ctx=ctx)
	func(data_tvm, weight_tvm, out_tvm)

	# Check results
	np.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3)

	# Evaluate execution time: report the median over the repeated measurements.
	evaluator = func.time_evaluator(func.entry_name, ctx, number=100, repeat=10)
	time = np.median(evaluator(data_tvm, weight_tvm, out_tvm).results)
	print("shape", data_np.shape, weight_np.shape)
	print(f"Execution time of this operator: {time * 1000:.3f} ms")
	# FLOP count is 2*M*N*K (one multiply and one add per term). The original
	# line referenced an undefined name `MNK`, which raised NameError.
	print(f"Speed: {2 * (M * N * K) / time / 1e12:.3f} TFLOPS")