@KireinaHoro · Created March 12, 2020 10:43
Example of prologue/epilogue pragmas for TVM.
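The gist has two parts: gemv.cc, a plain C file implementing the gemv_update microkernel together with the test_prologue and test_epilogue hooks, and a Python script that declares a small matrix multiply with TVM's tensor expression API, tensorizes the inner 16-element block of the schedule with the microkernel, and attaches the two hooks to the outer loop via the "prologue" and "epilogue" schedule pragmas.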
// gemv.cc: plain C implementations of the gemv_update microkernel and the
// prologue/epilogue hooks referenced by the schedule pragmas in the Python script.
#include <cassert>
#include <cstdint>
#include <cstdio>

// GEMV update: cc[i] += sum_j aa[j] * bb[i * stride + j] for an m x l block of bb.
extern "C" int gemv_update(uint32_t *cc, uint32_t *aa, uint32_t *bb, int m, int l, int stride) {
  printf("%s invoked\n", __func__);
  assert(m == 16);
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < l; ++j) {
      cc[i] += aa[j] * bb[i * stride + j];
    }
  }
  return 0;
}

// Hook attached to the outer loop via the "prologue" pragma.
extern "C" int test_prologue() {
  printf("%s invoked\n", __func__);
  return 0;
}

// Hook attached to the outer loop via the "epilogue" pragma.
extern "C" int test_epilogue() {
  printf("%s invoked\n", __func__);
  return 0;
}
import tvm
from tvm import te
import numpy as np
#target = "llvm -device=riscv_cpu -target=riscv64-unknown-linux-gnu -mfloat-abi=soft"
target = "llvm"
dtype = "int32"
N, M, L = 4, 32, 64
A = te.placeholder((N, L), name='A', dtype=dtype)
B = te.placeholder((M, L), name='B', dtype=dtype)
k = te.reduce_axis((0, L), name='k')
C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[j, k], axis=k), name='C')
s = te.create_schedule(C.op)
factor = 16
x, y = C.op.axis
z, = C.op.reduce_axis
yo, yi = s[C].split(y, factor=factor)
s[C].reorder(x, yo, yi, z)
def intrin_gemv(m, l):
    """Tensor intrinsic for an (m, l) * (l, 1) matrix-vector product."""
    assert m == 16
    a = te.placeholder((l,), name='a', dtype=dtype)
    b = te.placeholder((m, l), name='b', dtype=dtype)
    k = te.reduce_axis((0, l), name='k')
    c = te.compute((m,), lambda i: te.sum(a[k] * b[i, k], axis=k), name='c')
    Ab = tvm.tir.decl_buffer(a.shape, a.dtype,
                             name="A",
                             offset_factor=1,
                             strides=[1])
    Bb = tvm.tir.decl_buffer(b.shape, b.dtype,
                             name="B",
                             offset_factor=1,
                             strides=[te.var("s1"), 1])
    Cb = tvm.tir.decl_buffer(c.shape, c.dtype,
                             name="C",
                             offset_factor=1,
                             strides=[1])

    # Emit the intrinsic body: a single call to the external gemv_update kernel.
    def intrin_func(ins, outs):
        ib = tvm.tir.ir_builder.create()
        aa, bb = ins
        cc = outs[0]
        ib.emit(tvm.tir.call_extern("int32", "gemv_update",
                                    cc.access_ptr("w"),
                                    aa.access_ptr("r"),
                                    bb.access_ptr("r"),
                                    m, l, bb.strides[0]))
        return ib.get()

    with tvm.target.build_config(offset_factor=1):
        return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})
gemv = intrin_gemv(factor, L)
s[C].tensorize(yi, gemv)
# Compile the C implementation of gemv_update (and the hooks) to LLVM IR.
def intrinsic_impls(filename):
    with open(filename, "r") as f:
        cc_code = f.read()
    from tvm.contrib import util, clang
    import os
    clang.find_clang(required=True)
    temp = util.tempdir()
    ll_path = temp.relpath("gemv.ll")
    # Create LLVM IR from the C++ source code.
    ll_code = clang.create_llvm(cc_code, output=ll_path, options=[
        "-O3",
        f"-I{os.path.dirname(os.path.realpath(filename))}",
    ])
    return ll_code
s[C].pragma(s[C].op.axis[0], "import_llvm", intrinsic_impls("gemv.cc"))
s[C].pragma(yo, "prologue", "test_prologue")
s[C].pragma(yo, "epilogue", "test_epilogue")
print(tvm.lower(s, [A, B, C], simple_mode=True))
func = tvm.build(s, [A, B, C], target=target, name="gemv")
out_llvm_ir = "gemv.ll"
with open(out_llvm_ir, "w") as f:
    f.write(func.get_source())
print(f"Written LLVM IR to {out_llvm_ir}.")
# Run the built kernel and verify against numpy.
from topi.util import get_const_tuple
dtype = A.dtype
ctx = tvm.context("cpu", 0)
a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype)
b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype)
c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx)
func(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c)
tvm.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3)
print("Kernel executed and passed test.")