Created
July 2, 2024 01:31
-
-
Save jlewi/fd73cc2ae2ff2205fba84d5952b38723 to your computer and use it in GitHub Desktop.
Triton Dockerfile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ARG TRITON_VERSION=2.46.0
ARG TRITON_CONTAINER_VERSION=24.05
ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.05-py3-min

############################################################################
## Production stage: Create container with just inference server executable
############################################################################
FROM ${BASE_IMAGE}

# ARGs declared before FROM are only visible in FROM lines; redeclare them
# inside the stage so they can be expanded below.
ARG TRITON_VERSION
ARG TRITON_CONTAINER_VERSION

# key=value form: the legacy space-separated ENV form is deprecated.
ENV TRITON_SERVER_VERSION=${TRITON_VERSION}
ENV NVIDIA_TRITON_SERVER_VERSION=${TRITON_CONTAINER_VERSION}
LABEL com.nvidia.tritonserver.version="${TRITON_SERVER_VERSION}"

ENV PATH=/opt/tritonserver/bin:${PATH}

# Remove once https://github.com/openucx/ucx/pull/9148 is available
# in the min container.
ENV UCX_MEM_EVENTS=no

# TensorFlow backend tuning knobs, grouped into one instruction.
ENV TF_ADJUST_HUE_FUSED=1 \
    TF_ADJUST_SATURATION_FUSED=1 \
    TF_ENABLE_WINOGRAD_NONFUSED=1 \
    TF_AUTOTUNE_THRESHOLD=2

ENV TRITON_SERVER_GPU_ENABLED=1
# Create a user that can be used to run triton as non-root. Make sure
# this user is given ID 1000 (the assertions below fail the build
# otherwise). All server artifacts copied below are assigned to this user.
ENV TRITON_SERVER_USER=triton-server
RUN userdel tensorrt-server > /dev/null 2>&1 || true \
    && if ! id -u $TRITON_SERVER_USER > /dev/null 2>&1 ; then \
        useradd $TRITON_SERVER_USER; \
    fi \
    && [ `id -u $TRITON_SERVER_USER` -eq 1000 ] \
    && [ `id -g $TRITON_SERVER_USER` -eq 1000 ]
# Ensure apt-get won't prompt for selecting options
ENV DEBIAN_FRONTEND=noninteractive

# Common dependencies. FIXME (can any of these be conditional? For
# example libcurl only needed for GCS?)
# update + install in one layer; lists removed in the same layer so the
# stale cache never reaches the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        clang \
        curl \
        dirmngr \
        git \
        gperf \
        libb64-0d \
        libcurl4-openssl-dev \
        libgoogle-perftools-dev \
        libjemalloc-dev \
        libnuma-dev \
        libre2-9 \
        software-properties-common \
        wget \
    && rm -rf /var/lib/apt/lists/*
# Set TCMALLOC_RELEASE_RATE for users setting LD_PRELOAD with tcmalloc
ENV TCMALLOC_RELEASE_RATE=200

ENV DCGM_VERSION=3.2.6
# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
# - curl -f: fail the build on an HTTP error instead of saving an error
#   page as the .deb
# - apt-get (stable CLI) with -y instead of interactive `apt install`
# - pin via ${DCGM_VERSION} so the version tracks the ENV above
# - drop apt lists in the same layer
RUN curl -fsSL -o /tmp/cuda-keyring.deb \
        https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \
    && apt-get install -y /tmp/cuda-keyring.deb \
    && rm /tmp/cuda-keyring.deb \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
        datacenter-gpu-manager=1:${DCGM_VERSION} \
    && rm -rf /var/lib/apt/lists/*
# Extra defensive wiring for CUDA Compat lib: register lib.real with the
# dynamic linker via ld.so.conf.d, then remove the temporary symlink so
# only the ldconfig entry remains.
RUN ln -sf ${_CUDA_COMPAT_PATH}/lib.real ${_CUDA_COMPAT_PATH}/lib \
    && echo ${_CUDA_COMPAT_PATH}/lib > /etc/ld.so.conf.d/00-cuda-compat.conf \
    && ldconfig \
    && rm -f ${_CUDA_COMPAT_PATH}/lib
# python3, python3-pip and some pip installs required for the python backend.
# --no-cache-dir keeps the pip download cache out of the layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        libarchive-dev \
        libpython3-dev \
        python3 \
        python3-pip \
    && pip3 install --no-cache-dir --upgrade pip \
    && pip3 install --no-cache-dir --upgrade \
        numpy \
        setuptools \
        virtualenv \
        wheel \
    && rm -rf /var/lib/apt/lists/*
# Start the server install dir from a clean slate.
WORKDIR /opt/tritonserver
RUN rm -fr /opt/tritonserver/*

ENV NVIDIA_PRODUCT_NAME="Triton Server"
COPY docker/entrypoint.d/ /opt/nvidia/entrypoint.d/

# NOTE(review): <unknown> looks like a placeholder substituted by the
# upstream build pipeline — confirm before building this file directly.
ENV NVIDIA_BUILD_ID=<unknown>
LABEL com.nvidia.build.id=<unknown>
LABEL com.nvidia.build.ref=

# Server artifacts are owned by the non-root user created above (uid 1000).
WORKDIR /opt
COPY --chown=1000:1000 build/install tritonserver

WORKDIR /opt/tritonserver
COPY --chown=1000:1000 NVIDIA_Deep_Learning_Container_License.pdf .
# Remove TRT contents that are not needed in runtime.
# NOTE(review): installing the -dev cudnn package in a section meant to
# shrink the runtime image is surprising — confirm the runtime-only
# libcudnn8 package would not suffice.
RUN apt-get update \
    && apt-get install -y --no-install-recommends libcudnn8-dev \
    && rm -rf /var/lib/apt/lists/* \
    && ldconfig
RUN ARCH="$(uname -i)" \
    && rm -fr ${TRT_ROOT}/bin ${TRT_ROOT}/targets/${ARCH}-linux-gnu/bin ${TRT_ROOT}/data \
    && rm -fr ${TRT_ROOT}/doc ${TRT_ROOT}/onnx_graphsurgeon ${TRT_ROOT}/python \
    && rm -fr ${TRT_ROOT}/samples ${TRT_ROOT}/targets/${ARCH}-linux-gnu/samples
# Install required packages for TRT-LLM models.
# NOTE(review): transformers is unpinned — pin a version for reproducible
# builds.
RUN python3 -m pip install --no-cache-dir --upgrade pip \
    && pip3 install --no-cache-dir transformers

# Drop the static libs
RUN ARCH="$(uname -i)" \
    && rm -f ${TRT_ROOT}/targets/${ARCH}-linux-gnu/lib/libnvinfer*.a \
        ${TRT_ROOT}/targets/${ARCH}-linux-gnu/lib/libnvonnxparser_*.a

# Install TensorRT-LLM from the wheel bundled with the backend, removing
# the wheel in the same layer so it does not bloat the image.
RUN python3 -m pip install --no-cache-dir -U --pre \
        --extra-index-url https://pypi.nvidia.com \
        /opt/tritonserver/backends/tensorrtllm/tensorrt_llm-*.whl \
    && rm -fv /opt/tritonserver/backends/tensorrtllm/tensorrt_llm-*.whl

# Register the TRT-LLM and Triton server library dirs with the dynamic linker.
RUN find /usr -name libtensorrt_llm.so -exec dirname {} \; > /etc/ld.so.conf.d/tensorrt-llm.conf
RUN find /opt/tritonserver -name libtritonserver.so -exec dirname {} \; > /etc/ld.so.conf.d/triton-tensorrtllm-worker.conf
ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib/:/opt/tritonserver/backends/tensorrtllm:$LD_LIBRARY_PATH

# NOTE(review): no USER directive — the image runs as root even though
# triton-server (uid 1000) was created above; confirm whether the NVIDIA
# entrypoint drops privileges, otherwise add `USER $TRITON_SERVER_USER`.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment