@trevor-m · Created January 7, 2021
Simple network with a single conv2d to demonstrate how TensorRT doesn't consider a fast cuDNN kernel. Run with nvprof --profile-from-start-off.
import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit  # creates a CUDA context on import
import ctypes

# CUDA runtime handle, used to start/stop profiling around the inference call.
_cudart = ctypes.CDLL('libcudart.so')

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)

# NCHW shapes; the leading 100 is the batch size.
input_shape = [100, 2048, 33, 33]
output_shape = [100, 256, 33, 33]
def get_engine():
    """Build an implicit-batch engine containing a single 3x3 convolution."""
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, builder.create_builder_config() as config:
        # The batch dimension is implicit, so the input shape omits it.
        input_tensor = network.add_input(name="input", dtype=trt.float32, shape=input_shape[1:])
        conv1_w = np.random.randn(256, 2048, 3, 3).astype("float32")
        conv1 = network.add_convolution(input=input_tensor, num_output_maps=256, kernel_shape=(3, 3), kernel=conv1_w, bias=None)
        conv1.stride = (1, 1)
        conv1.padding = (1, 1)  # "same" padding for a 3x3 kernel
        conv1.get_output(0).name = "output"
        network.mark_output(conv1.get_output(0))
        builder.max_batch_size = 100
        config.max_workspace_size = 1 << 33  # 8 GiB of workspace for tactic selection
        return builder.build_engine(network, config)
engine = get_engine()

with trt.Runtime(TRT_LOGGER) as runtime, engine.create_execution_context() as context:
    # Page-locked host buffers and matching device allocations.
    h_input = cuda.pagelocked_empty(trt.volume(input_shape), dtype=np.float32)
    h_output = cuda.pagelocked_empty(trt.volume(output_shape), dtype=np.float32)
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    cuda.memcpy_htod(d_input, h_input)
    # Profile only the inference call (pairs with nvprof --profile-from-start-off).
    _cudart.cudaProfilerStart()
    context.execute(100, bindings=[int(d_input), int(d_output)])  # synchronous, batch size 100
    _cudart.cudaProfilerStop()
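
To reproduce the measurement, a minimal usage sketch (the filename conv2d_trt.py is an assumption, not part of the gist): with --profile-from-start-off, nvprof keeps profiling disabled until cudaProfilerStart() fires, so the report covers only the kernels TensorRT launches for this convolution.

    nvprof --profile-from-start-off python conv2d_trt.py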