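"""Run an OpenPifPaf pose-estimation TensorRT engine on a single image.

Pipeline: deserialize the engine, preprocess an image through the openpifpaf
data loader, run inference, decode the CIF/CAF field outputs with
CifCafDecoder, and draw the predicted keypoints and skeletons onto the image.
"""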
import random

import cv2
import numpy as np
import PIL.Image
import pycuda.driver as cuda
import pycuda.autoinit  # noqa: F401  (side effect: initializes the CUDA driver)
import tensorrt as trt
import torch

import openpifpaf
from decoder import CifCafDecoder
TRT_LOGGER = trt.Logger()
# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
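# The two helpers below appear to follow the buffer-allocation / inference
# pattern from NVIDIA's TensorRT Python samples (common.py), extended to also
# return binding shapes and names so the flat outputs can be reshaped later.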
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    out_shapes = []
    input_shapes = []
    out_names = []
    max_batch_size = engine.max_batch_size
    for binding in engine:
        binding_shape = engine.get_binding_shape(binding)
        # Fix -1 dimension for proper memory allocation for batch_size > 1
        if binding_shape[0] == -1:
            binding_shape = (1,) + tuple(binding_shape[1:])
        size = trt.volume(binding_shape) * max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
            input_shapes.append(engine.get_binding_shape(binding))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
            # Collect original output shapes and names from the engine.
            out_shapes.append(engine.get_binding_shape(binding))
            out_names.append(binding)
    return inputs, outputs, bindings, stream, input_shapes, out_shapes, out_names, max_batch_size
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference (explicit-batch engine, so no batch_size argument).
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream so the host buffers are valid before returning.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
class TrtModel(object):
    def __init__(self, model):
        self.engine_file = model
        self.engine = None
        self.inputs = None
        self.outputs = None
        self.bindings = None
        self.stream = None
        self.context = None
        self.input_shapes = None
        self.out_shapes = None
        self.max_batch_size = 1
        # Dedicated CUDA context so the engine can be driven from any thread.
        self.cuda_ctx = cuda.Device(0).make_context()
        if self.cuda_ctx:
            self.cuda_ctx.push()

    def build(self):
        with open(self.engine_file, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        (self.inputs, self.outputs, self.bindings, self.stream,
         self.input_shapes, self.out_shapes, self.out_names,
         self.max_batch_size) = allocate_buffers(self.engine)
        self.context = self.engine.create_execution_context()
        self.context.active_optimization_profile = 0
        if self.cuda_ctx:
            self.cuda_ctx.pop()

    def run(self, inp, deflatten: bool = True, as_dict=False):
        # Lazily deserialize the engine on first use.
        if self.engine is None:
            self.build()
        if self.cuda_ctx:
            self.cuda_ctx.push()
        inp = np.asarray(inp)
        allocate_place = np.prod(inp.shape)
        self.inputs[0].host[:allocate_place] = inp.flatten(order='C').astype(np.float32)
        self.context.set_binding_shape(0, inp.shape)
        trt_outputs = do_inference(
            self.context, bindings=self.bindings,
            inputs=self.inputs, outputs=self.outputs, stream=self.stream)
        if self.cuda_ctx:
            self.cuda_ctx.pop()
        # Reshape flat TRT outputs back to field shapes instead of flattened
        # arrays. The shapes are hardcoded for this particular 641x369 engine:
        # 17 CIF fields (one per COCO keypoint) and 19 CAF fields (one per
        # skeleton connection) on a 47x81 feature map.
        if deflatten:
            trt_outputs = [torch.from_numpy(output.reshape(shape))
                           for output, shape in zip(trt_outputs, [(17, 5, 47, 81), (19, 9, 47, 81)])]
        if as_dict:
            return {name: trt_outputs[i] for i, name in enumerate(self.out_names)}
        return trt_outputs
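# --- Example: build the engine, run one image through the openpifpaf
# preprocessing pipeline, decode the fields, and visualize the result. ---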
engine = TrtModel("/data/data/tensorrt/openpifpaf_resnet50_641_369_d16.trt")
engine.build()

image = cv2.imread("/data/warmup.jpg")
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# Min-max normalize to the 0..255 range, then resize to the engine's input size.
img_normalized = np.zeros(image.shape)
img_normalized = cv2.normalize(image_rgb, img_normalized, 0, 255, cv2.NORM_MINMAX)
img_normalized = cv2.resize(img_normalized, (641, 369))

# No openpifpaf preprocessing transform: the image was normalized manually above.
pil_im = PIL.Image.fromarray(img_normalized)
preprocess = None
data = openpifpaf.datasets.PilImageList([pil_im], preprocess=preprocess)
loader = torch.utils.data.DataLoader(
    data, batch_size=1, shuffle=False, pin_memory=True,
    collate_fn=openpifpaf.datasets.collate_images_anns_meta)

for images_batch, _, __ in loader:
    np_img = images_batch.numpy()
    trt_outputs = engine.run(np_img)

decoder = CifCafDecoder()
predictions = decoder.decode(trt_outputs)
# Draw keypoints and skeleton connections for each predicted person.
img_vis = cv2.resize(image, (641, 369))
for i, pred_object in enumerate(predictions):
    pred = pred_object.data
    pred_visible = pred[pred[:, 2] > 0]
    xs = pred_visible[:, 0]
    ys = pred_visible[:, 1]
    if len(xs) == 0 or min(xs) < 0 or min(ys) < 0:
        continue
    # One random color per person.
    color = (random.randint(60, 200), random.randint(0, 255), random.randint(0, 255))
    for x, y in zip(xs, ys):
        cv2.circle(img_vis, (int(x), int(y)), 2, color, -1)
    # Connect joint pairs that belong to the skeleton definition.
    decode_order = [(a, b) for (a, b, c, d) in pred_object.decoding_order]
    for index, (a, b) in enumerate(decode_order):
        if (a + 1, b + 1) in pred_object.skeleton or (b + 1, a + 1) in pred_object.skeleton:
            x1, y1, _ = pred_object.decoding_order[index][2]
            x2, y2, _ = pred_object.decoding_order[index][3]
            cv2.line(img_vis, (int(x1), int(y1)), (int(x2), int(y2)), color, 1)

cv2.imwrite("result.jpg", img_vis)
engine.cuda_ctx.pop()