@CasiaFan · Created November 4, 2019
Accelerating inference of an ONNX model with TensorRT
import tensorrt as trt
import numpy as np
import pycuda.autoinit  # initializes the CUDA driver and creates a context
import pycuda.driver as cuda
import time

model_path = "model.onnx"
input_size = 32

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_engine(model_path):
    # NOTE: TensorRT 7+ requires an explicit-batch network for the ONNX parser:
    # builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    with trt.Builder(TRT_LOGGER) as builder, \
         builder.create_network() as network, \
         trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = 1 << 20  # 1 MiB; increase if the build fails for lack of workspace
        builder.max_batch_size = 1
        with open(model_path, "rb") as f:
            if not parser.parse(f.read()):
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                return None
        return builder.build_cuda_engine(network)
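
# Building the engine is the slow step, so it is worth caching on disk. A
# minimal sketch (the "model.trt" path is an illustrative default; assumption:
# a serialized engine can only be reloaded with the same TensorRT version and
# GPU it was built with):
def save_engine(engine, path="model.trt"):
    with open(path, "wb") as f:
        f.write(engine.serialize())

def load_engine(path="model.trt"):
    with open(path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())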

def alloc_buf(engine):
    # page-locked host (CPU) buffers sized from the engine's I/O bindings
    h_in_size = trt.volume(engine.get_binding_shape(0))
    h_out_size = trt.volume(engine.get_binding_shape(1))
    h_in_dtype = trt.nptype(engine.get_binding_dtype(0))
    h_out_dtype = trt.nptype(engine.get_binding_dtype(1))
    in_cpu = cuda.pagelocked_empty(h_in_size, h_in_dtype)
    out_cpu = cuda.pagelocked_empty(h_out_size, h_out_dtype)
    # device (GPU) buffers of matching size
    in_gpu = cuda.mem_alloc(in_cpu.nbytes)
    out_gpu = cuda.mem_alloc(out_cpu.nbytes)
    stream = cuda.Stream()
    return in_cpu, out_cpu, in_gpu, out_gpu, stream

def inference(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream):
    # sync version; an async variant is sketched in inference_async below.
    # The execution context is created once by the caller: creating it here
    # on every call costs noticeable time.
    cuda.memcpy_htod(in_gpu, inputs)
    context.execute(1, [int(in_gpu), int(out_gpu)])
    cuda.memcpy_dtoh(out_cpu, out_gpu)
    return out_cpu
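
# A runnable sketch of the async variant (same implicit-batch TensorRT API as
# above; in_cpu is the page-locked host buffer from alloc_buf, which is what
# lets the copies run asynchronously on the stream):
def inference_async(context, inputs, in_cpu, out_cpu, in_gpu, out_gpu, stream):
    np.copyto(in_cpu, inputs.ravel())                 # stage into pinned host memory
    cuda.memcpy_htod_async(in_gpu, in_cpu, stream)    # async host-to-device copy
    context.execute_async(1, [int(in_gpu), int(out_gpu)], stream.handle, None)
    cuda.memcpy_dtoh_async(out_cpu, out_gpu, stream)  # async device-to-host copy
    stream.synchronize()                              # wait for all queued work
    return out_cpu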

if __name__ == "__main__":
    inputs = np.random.random((1, 3, input_size, input_size)).astype(np.float32)
    engine = build_engine(model_path)
    context = engine.create_execution_context()
    for _ in range(10):
        t1 = time.time()
        # buffer allocation sits inside the loop, so it is included in the measured time
        in_cpu, out_cpu, in_gpu, out_gpu, stream = alloc_buf(engine)
        res = inference(engine, context, inputs.reshape(-1), out_cpu, in_gpu, out_gpu, stream)
        print(res)
        print("cost time: ", time.time() - t1)
# TensorRT docker image: docker pull nvcr.io/nvidia/tensorrt:19.09-py3
# (see https://ngc.nvidia.com/catalog/containers/nvidia:tensorrt/tags)
# NOTE: requires CUDA driver >= 418