Triton Ensemble
# Convert a Huggingface model to ONNX
docker run -it --rm --gpus all \
  -v $PWD:/project ghcr.io/els-rd/transformer-deploy:0.5.1 \
  bash -c "cd /project && \
    convert_model -m \"philschmid/MiniLM-L6-H384-uncased-sst2\" \
    --backend tensorrt onnx \
    --seq-len 16 128 128"
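
# (A sketch added for context, not part of the original gist.) Before
# serving, it can help to inspect what convert_model generated:
# transformer-deploy writes one folder per Triton model (tokenizer,
# model backend, and an ensemble chaining them). The exact folder names
# depend on the backends you chose, so verify them in your own output.
ls triton_models/
find triton_models -name config.pbtxt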
# This produces a triton_models/ folder,
# which we can now serve using Triton
docker run -it --rm --gpus all -p8000:8000 -p8001:8001 -p8002:8002 --shm-size 256m \
  -v $PWD/triton_models:/models nvcr.io/nvidia/tritonserver:22.07-py3 \
  bash -c "pip install transformers && tritonserver --model-repository=/models"