-
-
Save rmccorm4/dabccb1f31dbdcf1019a4df431067e52 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3 | |
import argparse | |
from typing import Tuple, List | |
import numpy as np | |
import pycuda.driver as cuda | |
import pycuda.autoinit | |
import tensorrt as trt | |
# Module-wide TensorRT logger, constructed with the WARNING severity level.
# It is handed to the TensorRT runtime when an engine is deserialized.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
def is_fixed(shape: Tuple[int]):
    """Return True when every dimension of *shape* is concrete.

    A dimension is concrete when it is neither ``None`` nor negative, so this
    is the exact complement of ``is_dynamic``. An empty shape is fixed.
    """
    return all(extent is not None and not extent < 0 for extent in shape)
def is_dynamic(shape: Tuple[int]):
    """Return True when *shape* has at least one unresolved dimension.

    A dimension is unresolved when it is ``None`` or negative. An empty shape
    has no unresolved dimensions and therefore is not dynamic.
    """
    for extent in shape:
        if extent is None or extent < 0:
            return True
    return False
def setup_binding_shapes(
    engine: trt.ICudaEngine,
    context: trt.IExecutionContext,
    host_inputs: List[np.ndarray],
    input_binding_idxs: List[int],
    output_binding_idxs: List[int],
):
    """Set the input binding shapes on *context* and allocate output buffers.

    Args:
        engine: Engine the context was created from; queried for the data
            type of each output binding.
        context: Execution context whose input binding shapes are set.
        host_inputs: Host arrays, one per input binding, whose ``.shape`` is
            used as the binding shape.
        input_binding_idxs: Binding index for each entry of ``host_inputs``.
        output_binding_idxs: Binding indices of the outputs to allocate.

    Returns:
        A ``(host_outputs, device_outputs)`` pair of lists, one entry per
        output binding: an uninitialised NumPy array, and a device allocation
        of the same size in bytes.

    Raises:
        ValueError: If ``host_inputs`` and ``input_binding_idxs`` differ in
            length.
        RuntimeError: If the context still reports unspecified binding shapes
            after all input shapes have been set.
    """
    # zip() would silently drop the surplus entries, leaving some input
    # shapes unset, so reject a mismatch up front.
    if len(host_inputs) != len(input_binding_idxs):
        raise ValueError(
            "Got {} host inputs for {} input bindings".format(
                len(host_inputs), len(input_binding_idxs)
            )
        )

    # Explicitly set the dynamic input shapes, so the dynamic output
    # shapes can be computed internally
    for host_input, binding_index in zip(host_inputs, input_binding_idxs):
        context.set_binding_shape(binding_index, host_input.shape)

    # Raise instead of assert: asserts are stripped when Python runs with -O.
    if not context.all_binding_shapes_specified:
        raise RuntimeError(
            "Not all binding shapes are specified on the execution context"
        )

    host_outputs = []
    device_outputs = []
    for binding_index in output_binding_idxs:
        output_shape = context.get_binding_shape(binding_index)
        # Use the dtype the engine declares for this binding rather than
        # assuming float32, so the buffer has the right element size.
        output_dtype = trt.nptype(engine.get_binding_dtype(binding_index))
        # Allocate buffers to hold output results after copying back to host
        buffer = np.empty(output_shape, dtype=output_dtype)
        host_outputs.append(buffer)
        # Allocate output buffers on device
        device_outputs.append(cuda.mem_alloc(buffer.nbytes))
    return host_outputs, device_outputs
def get_binding_idxs(engine: trt.ICudaEngine, profile_index: int):
    """Split the binding indices of one optimization profile into inputs/outputs.

    The engine's bindings are divided evenly among its optimization profiles;
    the slice belonging to ``profile_index`` is walked once, in ascending
    order, and each index is classified with ``engine.binding_is_input``.

    Returns:
        ``(input_binding_idxs, output_binding_idxs)``, both in ascending order.
    """
    per_profile = engine.num_bindings // engine.num_optimization_profiles
    first = profile_index * per_profile

    inputs, outputs = [], []
    for idx in range(first, first + per_profile):
        bucket = inputs if engine.binding_is_input(idx) else outputs
        bucket.append(idx)
    return inputs, outputs
def load_engine(filename: str):
    """Deserialize a TensorRT engine from the file at *filename*.

    Args:
        filename: Path to a serialized TensorRT engine.

    Returns:
        The deserialized engine.

    Raises:
        OSError: If the file cannot be opened or read.
        RuntimeError: If TensorRT could not deserialize the file contents.
    """
    # Load serialized engine file into memory
    with open(filename, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    # Deserialization signals failure by returning None; surface that here
    # rather than letting the caller fail later with an AttributeError.
    if engine is None:
        raise RuntimeError(
            "Failed to deserialize TensorRT engine from {!r}".format(filename)
        )
    return engine
def get_random_inputs(
    engine: trt.ICudaEngine,
    context: trt.IExecutionContext,
    input_binding_idxs: List[int],
):
    """Build one random float32 host array per input binding.

    For each input binding the shape reported by *context* is used when it is
    fixed. When it is dynamic, the second entry (index 1, "opt") of the shapes
    returned by ``engine.get_profile_shape`` for the context's active
    optimization profile is used instead.

    Returns:
        A list of float32 NumPy arrays, in the order of ``input_binding_idxs``.
    """

    def resolve_shape(idx):
        # Prefer the context's own shape; fall back to the profile only
        # when that shape still contains unresolved dimensions.
        shape = context.get_binding_shape(idx)
        if not is_dynamic(shape):
            return shape
        active_profile = context.active_optimization_profile
        # Profile shapes are ordered 0=min, 1=opt, 2=max; any shape with
        # min <= shape <= max would also be valid.
        return engine.get_profile_shape(active_profile, idx)[1]

    return [
        np.random.random(resolve_shape(idx)).astype(np.float32)
        for idx in input_binding_idxs
    ]
def main():
    """Run a single inference with random inputs on a serialized engine.

    Parses ``-e/--engine`` from the command line, loads the engine, creates an
    execution context on optimization profile 0, generates random inputs,
    runs inference, and prints the input shapes, output shapes and outputs.

    Raises:
        RuntimeError: If the execution context cannot be created or if
            inference reports failure.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-e", "--engine", required=True, type=str, help="Path to TensorRT engine file."
    )
    args = parser.parse_args()

    # Load a serialized engine into memory
    engine = load_engine(args.engine)

    # Create context, this can be re-used
    context = engine.create_execution_context()
    if context is None:
        # Context creation returns None on failure (e.g. out of device memory).
        raise RuntimeError("Failed to create a TensorRT execution context")

    # Profile 0 (first profile) is used by default
    context.active_optimization_profile = 0

    # These binding_idxs can change if either the context or the
    # active_optimization_profile are changed
    input_binding_idxs, output_binding_idxs = get_binding_idxs(
        engine, context.active_optimization_profile
    )

    # Generate random inputs based on profile shapes
    host_inputs = get_random_inputs(engine, context, input_binding_idxs)
    print("Input Shapes: {}".format([inp.shape for inp in host_inputs]))

    # Allocate device memory for inputs. This can be easily re-used if the
    # input shapes don't change
    device_inputs = [cuda.mem_alloc(h_input.nbytes) for h_input in host_inputs]

    # Copy host inputs to device, this needs to be done for each new input
    for h_input, d_input in zip(host_inputs, device_inputs):
        cuda.memcpy_htod(d_input, h_input)

    # This needs to be called everytime your input shapes change
    # If your inputs are always the same shape (same batch size, etc.),
    # then you will only need to call this once
    host_outputs, device_outputs = setup_binding_shapes(
        engine, context, host_inputs, input_binding_idxs, output_binding_idxs,
    )
    print("Output Shapes: {}".format([out.shape for out in host_outputs]))

    # Bindings are a list of device pointers with one slot per engine binding.
    # Place each pointer at its own binding index instead of concatenating
    # inputs and outputs, so the layout does not depend on inputs preceding
    # outputs or on the active profile being the first one.
    bindings = [0] * engine.num_bindings
    binding_idxs = input_binding_idxs + output_binding_idxs
    for binding_index, device_buffer in zip(
        binding_idxs, device_inputs + device_outputs
    ):
        bindings[binding_index] = int(device_buffer)

    # Inference. execute_v2 reports failure through its return value; without
    # this check the uninitialised output buffers would be printed as results.
    if not context.execute_v2(bindings):
        raise RuntimeError(
            "TensorRT inference failed; see the TensorRT log for details"
        )

    # Copy outputs back to host to view results
    for h_output, d_output in zip(host_outputs, device_outputs):
        cuda.memcpy_dtoh(h_output, d_output)

    # View outputs
    print(host_outputs)

    # Cleanup (Can also use context managers instead)
    del context
    del engine
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Hi @TerryBryant,
[TensorRT] ERROR: Profile 0 has been chosen by another IExecutionContext. Use another profileIndex or destroy the IExecutionContext that use this profile.
You're likely using the same profile index on each thread, which isn't currently allowed.
At engine building time (single thread), you'll need to create at least as many optimization profiles as the number of threads you expect to be running simultaneously.
At runtime, for each thread you will likely need to do something like:
- Create a new execution context
- Assign the execution context an optimization profile that's not currently in use by another thread
For example, let's say you want to run 4 threads.
```python
# --- Build time --- #

# Create 4 opt profiles
...
profile0 = builder.create_optimization_profile()
profile0.set_shape(...)
builder_config.add_optimization_profile(profile0)  # profile_index=0

profile1 = builder.create_optimization_profile()
profile1.set_shape(...)
builder_config.add_optimization_profile(profile1)  # profile_index=1

profile2 = builder.create_optimization_profile()
profile2.set_shape(...)
builder_config.add_optimization_profile(profile2)  # profile_index=2

profile3 = builder.create_optimization_profile()
profile3.set_shape(...)
builder_config.add_optimization_profile(profile3)  # profile_index=3
...
engine = builder.build_engine(network, builder_config)
```
```python
# --- Inference time ---
# Create an execution context with a unique optimization profile for each thread

# thread 0
context0 = engine.create_execution_context()
context0.active_optimization_profile = 0

# thread 1
context1 = engine.create_execution_context()
context1.active_optimization_profile = 1

# thread 2
context2 = engine.create_execution_context()
context2.active_optimization_profile = 2

# thread 3
context3 = engine.create_execution_context()
context3.active_optimization_profile = 3
```
I've tried this solution, and two problems occurred:
1. The serialized engine file becomes very large, because I added 10 profiles.
2. The following log output appears, but I can still run inference. I don't know whether it is only a warning or whether something is actually going wrong:
[TensorRT] WARNING: Total space of persistent layer space is 524160 on host and 3773253120 on device
[TensorRT] ERROR: ../rtSafe/safeRuntime.cpp (25) - Cuda Error in allocate: 2 (out of memory)
[TensorRT] ERROR: FAILED_ALLOCATION: std::exception
So I think this is only a temporary workaround. I hope you and the official TensorRT team can take this multi-threading problem into consideration.
Ok, I got it. Thank you.