Skip to content

Instantly share code, notes, and snippets.

@masahi
Created March 15, 2022 00:09
Show Gist options
  • Save masahi/136e86bc813754de67f35ffb86c1fedd to your computer and use it in GitHub Desktop.
"""Benchmark an int8-quantized BERT-base Relay model with TVM on CUDA.

Loads a serialized Relay module + params from ``models/``, compiles it for
the CUDA target at opt_level=3, feeds random token ids, and reports the
measured inference time.
"""
import numpy as np
import tvm
from tvm import relay
import tvm.contrib.graph_executor as runtime

batch_size = 1
seq_len = 384

# Random stand-ins for (input_ids, segment_ids, input_mask); real token ids
# would come from a tokenizer, but timing only needs correctly-shaped int64.
inputs = (
    np.random.uniform(1, 100, size=(batch_size, seq_len)).astype("int64"),
    np.random.uniform(1, 100, size=(batch_size, seq_len)).astype("int64"),
    np.random.uniform(1, 100, size=(batch_size, seq_len)).astype("int64"),
)

# Deserialize the Relay IR (JSON) and the trained parameters (binary).
with open("models/bert_base_int8.json", "r") as fi:
    mod = tvm.ir.load_json(fi.read())
with open("models/bert_base_int8.params", "rb") as fi:
    params = relay.load_param_dict(fi.read())

# target = "cuda -libs=cublas"  # alternative: offload dense ops to cuBLAS
target = "cuda"

# Compile the whole model at the highest optimization level.
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build_module.build(mod, target=target, params=params)

dev = tvm.device(str(target), 0)
module = runtime.GraphModule(lib["default"](dev))

module.set_input("input_ids", inputs[0])
module.set_input("segment_ids", inputs[1])
module.set_input("input_mask", inputs[2])
module.run()  # warm-up run before timing

print("Evaluate inference time cost with target %s ..." % target)
# number=1 run per measurement, repeated 50 times for stable statistics.
print(module.benchmark(dev, number=1, repeat=50))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment