@KireinaHoro · Created March 12, 2020 10:43
Example of prologue/epilogue pragmas for TVM.
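The gist has two parts: gemv.cc, a plain C file implementing the gemv_update microkernel together with the test_prologue and test_epilogue hooks, and a Python script that declares a small matrix multiply with TVM's tensor expression API, tensorizes the inner 16-element block of the schedule with the microkernel, and attaches the two hooks to the outer loop via the "prologue" and "epilogue" schedule pragmas.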
// gemv.cc: plain C implementations of the gemv_update microkernel and the
// prologue/epilogue hooks referenced by the schedule pragmas in the Python script.
#include <cassert>
#include <cstdint>
#include <cstdio>

// GEMV update: cc[i] += sum_j aa[j] * bb[i * stride + j] for an m x l block of bb.
extern "C" int gemv_update(uint32_t *cc, uint32_t *aa, uint32_t *bb, int m, int l, int stride) {
  printf("%s invoked\n", __func__);
  assert(m == 16);
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < l; ++j) {
      cc[i] += aa[j] * bb[i * stride + j];
    }
  }
  return 0;
}

// Hook attached to the outer loop via the "prologue" pragma.
extern "C" int test_prologue() {
  printf("%s invoked\n", __func__);
  return 0;
}

// Hook attached to the outer loop via the "epilogue" pragma.
extern "C" int test_epilogue() {
  printf("%s invoked\n", __func__);
  return 0;
}
import tvm
from tvm import te
import numpy as np
#target = "llvm -device=riscv_cpu -target=riscv64-unknown-linux-gnu -mfloat-abi=soft"
target = "llvm"
dtype = "int32"
N, M, L = 4, 32, 64
A = te.placeholder((N, L), name='A', dtype=dtype)
B = te.placeholder((M, L), name='B', dtype=dtype)
k = te.reduce_axis((0, L), name='k')
C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[j, k], axis=k), name='C')
s = te.create_schedule(C.op)
factor = 16
x, y = C.op.axis
z, = C.op.reduce_axis
yo, yi = s[C].split(y, factor=factor)
s[C].reorder(x, yo, yi, z)
def intrin_gemv(m, l):
    """Tensor intrinsic for an (m, l) * (l, 1) matrix-vector product."""
    assert m == 16
    a = te.placeholder((l,), name='a', dtype=dtype)
    b = te.placeholder((m, l), name='b', dtype=dtype)
    k = te.reduce_axis((0, l), name='k')
    c = te.compute((m,), lambda i: te.sum(a[k] * b[i, k], axis=k), name='c')
    Ab = tvm.tir.decl_buffer(a.shape, a.dtype,
                             name="A",
                             offset_factor=1,
                             strides=[1])
    Bb = tvm.tir.decl_buffer(b.shape, b.dtype,
                             name="B",
                             offset_factor=1,
                             strides=[te.var("s1"), 1])
    Cb = tvm.tir.decl_buffer(c.shape, c.dtype,
                             name="C",
                             offset_factor=1,
                             strides=[1])

    # Emit the intrinsic body: a single call to the external gemv_update kernel.
    def intrin_func(ins, outs):
        ib = tvm.tir.ir_builder.create()
        aa, bb = ins
        cc = outs[0]
        ib.emit(tvm.tir.call_extern("int32", "gemv_update",
                                    cc.access_ptr("w"),
                                    aa.access_ptr("r"),
                                    bb.access_ptr("r"),
                                    m, l, bb.strides[0]))
        return ib.get()

    with tvm.target.build_config(offset_factor=1):
        return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})
gemv = intrin_gemv(factor, L)
s[C].tensorize(yi, gemv)
# Compile the C implementation of gemv_update (and the hooks) to LLVM IR.
def intrinsic_impls(filename):
    with open(filename, "r") as f:
        cc_code = f.read()
    from tvm.contrib import util, clang
    import os
    clang.find_clang(required=True)
    temp = util.tempdir()
    ll_path = temp.relpath("gemv.ll")
    # Create LLVM IR from the C++ source code.
    ll_code = clang.create_llvm(cc_code, output=ll_path, options=[
        "-O3",
        f"-I{os.path.dirname(os.path.realpath(filename))}",
    ])
    return ll_code
s[C].pragma(s[C].op.axis[0], "import_llvm", intrinsic_impls("gemv.cc"))
s[C].pragma(yo, "prologue", "test_prologue")
s[C].pragma(yo, "epilogue", "test_epilogue")
print(tvm.lower(s, [A, B, C], simple_mode=True))
func = tvm.build(s, [A, B, C], target=target, name="gemv")
out_llvm_ir = "gemv.ll"
with open(out_llvm_ir, "w") as f:
    f.write(func.get_source())
print(f"Written LLVM IR to {out_llvm_ir}.")
# Run the built kernel and verify against numpy.
from topi.util import get_const_tuple
dtype = A.dtype
ctx = tvm.context("cpu", 0)
a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype)
b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype)
c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx)
func(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c)
tvm.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3)
print("Kernel executed and passed test.")