Last active
October 12, 2022 06:28
-
-
Save rmccorm4/dabccb1f31dbdcf1019a4df431067e52 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse | |
from typing import Tuple, List | |
import numpy as np | |
import pycuda.driver as cuda | |
import pycuda.autoinit | |
import tensorrt as trt | |
TRT_LOGGER = trt.Logger(trt.Logger.WARNING) | |
def is_fixed(shape: Tuple[int, ...]) -> bool:
    """Return True if every dimension in *shape* is concrete (no None / -1).

    Note: the annotation is ``Tuple[int, ...]`` (variable-length tuple);
    ``Tuple[int]`` would incorrectly declare a 1-tuple.
    """
    return not is_dynamic(shape)
def is_dynamic(shape: Tuple[int, ...]) -> bool:
    """Return True if *shape* contains any dynamic dimension.

    TensorRT marks dynamic dimensions with -1; ``None`` is also treated
    as dynamic for robustness with shapes from other sources.
    (Annotation fixed to ``Tuple[int, ...]``; ``Tuple[int]`` means a 1-tuple.)
    """
    return any(dim is None or dim < 0 for dim in shape)
def setup_binding_shapes(
    engine: trt.ICudaEngine,
    context: trt.IExecutionContext,
    host_inputs: List[np.ndarray],
    input_binding_idxs: List[int],
    output_binding_idxs: List[int],
):
    """Propagate input shapes into the context and allocate output buffers.

    Sets each input binding's shape from the corresponding host array so
    TensorRT can resolve the (possibly dynamic) output shapes, then
    allocates one host array and one device buffer per output binding.

    Returns:
        (host_outputs, device_outputs): parallel lists, in output-binding
        order, of np.ndarray host buffers and pycuda device allocations.
    """
    # Explicitly set the dynamic input shapes, so the dynamic output
    # shapes can be computed internally
    for host_input, binding_index in zip(host_inputs, input_binding_idxs):
        context.set_binding_shape(binding_index, host_input.shape)
    assert context.all_binding_shapes_specified

    host_outputs = []
    device_outputs = []
    for binding_index in output_binding_idxs:
        output_shape = context.get_binding_shape(binding_index)
        # Fix: use the engine's declared output dtype instead of assuming
        # float32 — engines with e.g. int32 or fp16 outputs otherwise get
        # wrongly typed (and wrongly sized) buffers. This also gives the
        # previously unused `engine` parameter its purpose.
        output_dtype = trt.nptype(engine.get_binding_dtype(binding_index))
        # Allocate buffers to hold output results after copying back to host
        buffer = np.empty(output_shape, dtype=output_dtype)
        host_outputs.append(buffer)
        # Allocate output buffers on device
        device_outputs.append(cuda.mem_alloc(buffer.nbytes))
    return host_outputs, device_outputs
def get_binding_idxs(engine: "trt.ICudaEngine", profile_index: int):
    """Return (input_binding_idxs, output_binding_idxs) for one profile.

    An engine with multiple optimization profiles repeats its bindings
    once per profile; this selects the contiguous range that belongs to
    *profile_index* and partitions it into input and output indices.
    """
    # Each profile owns an equal, contiguous slice of the binding range.
    per_profile = engine.num_bindings // engine.num_optimization_profiles
    first = profile_index * per_profile
    profile_bindings = range(first, first + per_profile)

    # Partition the slice by binding direction.
    input_binding_idxs = [
        idx for idx in profile_bindings if engine.binding_is_input(idx)
    ]
    output_binding_idxs = [
        idx for idx in profile_bindings if not engine.binding_is_input(idx)
    ]
    return input_binding_idxs, output_binding_idxs
def load_engine(filename: str):
    """Deserialize a TensorRT engine from the file at *filename*."""
    # Read the serialized plan into memory first...
    with open(filename, "rb") as engine_file:
        serialized_engine = engine_file.read()
    # ...then hand it to a short-lived Runtime for deserialization.
    with trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(serialized_engine)
def get_random_inputs(
    engine: "trt.ICudaEngine",
    context: "trt.IExecutionContext",
    input_binding_idxs: "List[int]",
):
    """Build one random float32 host array per input binding.

    Fixed input shapes are used as-is; for dynamic shapes, the profile's
    "opt" shape is arbitrarily chosen (any shape between min and max
    would be valid).
    """
    host_inputs = []
    for idx in input_binding_idxs:
        shape = context.get_binding_shape(idx)
        if is_dynamic(shape):
            # Dynamic shape: resolve it from the active optimization
            # profile. get_profile_shape returns (min, opt, max); we pick
            # opt, but any shape with min <= shape <= max works.
            profile = context.active_optimization_profile
            _min_shape, opt_shape, _max_shape = engine.get_profile_shape(
                profile, idx
            )
            shape = opt_shape
        host_inputs.append(np.random.random(shape).astype(np.float32))
    return host_inputs
def main():
    """Run one round of inference with random inputs on a serialized engine."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-e", "--engine", required=True, type=str, help="Path to TensorRT engine file."
    )
    cli_args = arg_parser.parse_args()

    # Deserialize the engine; the execution context below can be re-used.
    engine = load_engine(cli_args.engine)
    context = engine.create_execution_context()
    # The first optimization profile (index 0) is selected by default.
    context.active_optimization_profile = 0

    # Binding indices depend on both the context and the active profile;
    # recompute them if either changes.
    input_binding_idxs, output_binding_idxs = get_binding_idxs(
        engine, context.active_optimization_profile
    )

    # Random host-side inputs sized from the profile shapes.
    host_inputs = get_random_inputs(engine, context, input_binding_idxs)
    print("Input Shapes: {}".format([inp.shape for inp in host_inputs]))

    # Device buffers for the inputs; reusable while input shapes stay fixed.
    device_inputs = [cuda.mem_alloc(h_in.nbytes) for h_in in host_inputs]
    # Host-to-device copy must be repeated for every new input.
    for h_input, d_input in zip(host_inputs, device_inputs):
        cuda.memcpy_htod(d_input, h_input)

    # Must be redone whenever the input shapes change; with constant
    # shapes (fixed batch size, etc.) a single call suffices.
    host_outputs, device_outputs = setup_binding_shapes(
        engine, context, host_inputs, input_binding_idxs, output_binding_idxs,
    )
    print("Output Shapes: {}".format([out.shape for out in host_outputs]))

    # Bindings = device pointers for all inputs followed by all outputs.
    bindings = device_inputs + device_outputs
    context.execute_v2(bindings)

    # Bring the results back to the host and show them.
    for h_output, d_output in zip(host_outputs, device_outputs):
        cuda.memcpy_dtoh(h_output, d_output)
    print(host_outputs)

    # Explicitly release TensorRT objects (context managers also work).
    del context
    del engine


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I've tried this solution, and two problems occurred:
1. The serialized engine file becomes very large, because I added 10 profiles.
2. This kind of log message appears,
but I can still run inference; I don't know whether it is only a warning or whether something actually goes wrong. So I think this is a temporary solution, and I hope you and the official TensorRT team can take this multi-threading problem into consideration.