@Jonpro03
Last active May 5, 2024 15:03
Dockerfile to run NVIDIA compute on a serge-chat docker container
Updated to Serge 0.9.0 and CUDA 12.4.1
Ready-to-run images available here: https://hub.docker.com/r/jonpro03/cuda_serge
Tested with a Quadro P2000 on Ubuntu 22.04
Install Docker
Install NVIDIA driver 535
Install CUDA 12.4.1
Install nvidia-container-toolkit (follow the WSL2 instructions if using Windows)
https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#setting-up-nvidia-container-toolkit
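After installing the toolkit, register it with Docker and run a quick smoke test to confirm containers can see the GPU. The first two commands come from the linked install guide; the image tag in the third is just one that matches this setup:

sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi

If nvidia-smi prints your GPU, the toolkit is working.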
Launch with the --gpus all flag, like:
docker run --gpus all -v weights:/usr/src/app/weights -v datadb:/data/db/ -p 8008:8008 jonpro03/cuda_serge:latest
The Dockerfile is configured to be fairly backwards compatible with older systems. You might try turning F16C/FMA/AVX2 back on and seeing if it compiles on your system; this is the _compat tag.
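To check which of those instruction sets your CPU supports before rebuilding (a generic check, nothing serge-specific):

grep -o -w 'avx2\|fma\|f16c' /proc/cpuinfo | sort -u

For older CPUs, a sketch of the edited CMAKE_ARGS line in the build stage below, using the optional flags noted in the Dockerfile comment:

ENV CMAKE_ARGS="-DLLAMA_CUDA=ON -DLLAMA_NATIVE=OFF -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF"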
If it's working, you should see something like the following in the container's stdout:
ggml_init_cublas: found 1 CUDA devices:
Device 0: Quadro P2000, compute capability 6.1
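One way to confirm that line on a running container (the container name here is a placeholder for whatever you named yours):

docker logs serge 2>&1 | grep -i cuda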
# ---------------------------------------
# Base image for redis
FROM redis:7-bookworm as redis
# ---------------------------------------
# Build frontend
FROM node:20-bookworm-slim as frontend
WORKDIR /usr/src/app
COPY ./web/package.json ./web/package-lock.json ./
RUN npm ci
COPY ./web /usr/src/app/web/
WORKDIR /usr/src/app/web/
RUN npm run build
# ---------------------------------------
# Build environment
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 as build
# Set ENV
ENV FORCE_CMAKE=1
# optional extras might be required for older CPUs: -DLLAMA_NATIVE=OFF -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF
ENV CMAKE_ARGS="-DLLAMA_CUDA=ON"
# Install api dependencies
RUN apt-get update \
&& apt-get install -y --no-install-recommends python3 python3-pip build-essential
# Install llama-cpp-python
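# The wheel is compiled against this devel image's CUDA toolchain, so the
# runtime stage can install a prebuilt CUDA-enabled wheel from /usr/local/bin/wheels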
RUN pip wheel llama-cpp-python --wheel-dir=/usr/local/bin/wheels
# ---------------------------------------
# Runtime environment
FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04 as release
# Set ENV
ENV NODE_ENV='production'
ENV TZ=Etc/UTC
WORKDIR /usr/src/app
# Copy artifacts
COPY --from=redis /usr/local/bin/redis-server /usr/local/bin/redis-server
COPY --from=redis /usr/local/bin/redis-cli /usr/local/bin/redis-cli
COPY --from=frontend /usr/src/app/web/build /usr/src/app/api/static/
COPY --from=build /usr/local/bin/wheels /usr/local/bin/wheels
COPY ./api /usr/src/app/api
COPY scripts/deploy.sh /usr/src/app/deploy.sh
COPY scripts/serge.env /usr/src/app/serge.env
COPY vendor/requirements.txt /usr/src/app/requirements.txt
# Install api dependencies
RUN apt-get update \
&& apt-get install -y --no-install-recommends python3 python3-pip python-is-python3 libxml2 build-essential dumb-init \
&& pip install --no-cache-dir ./api \
&& apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* \
&& chmod 755 /usr/src/app/deploy.sh \
&& chmod 755 /usr/local/bin/redis-server \
&& chmod 755 /usr/local/bin/redis-cli \
&& mkdir -p /etc/redis \
&& mkdir -p /data/db \
&& mkdir -p /usr/src/app/weights \
&& echo "appendonly yes" >> /etc/redis/redis.conf \
&& echo "dir /data/db/" >> /etc/redis/redis.conf
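# Install the CUDA-enabled llama-cpp-python wheel built in the build stage;
# --no-index keeps pip from pulling a CPU-only build from PyPI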
RUN pip install --no-index --find-links=/usr/local/bin/wheels llama-cpp-python
EXPOSE 8008
ENTRYPOINT ["/usr/bin/dumb-init", "--"]
CMD ["/bin/bash", "-c", "/usr/src/app/deploy.sh"]
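To build the image yourself instead of pulling from Docker Hub: the COPY paths above (./web, ./api, scripts/, vendor/) assume the Dockerfile sits at the root of a serge checkout. A sketch, with an assumed filename for the Dockerfile:

git clone https://github.com/serge-chat/serge.git
cd serge
# save the Dockerfile above as Dockerfile.cuda, then:
docker build -f Dockerfile.cuda -t cuda_serge:latest .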
Aeyk commented Nov 4, 2023

Why did Redis get replaced with Dragonfly? Is that necessary to get CUDA working in this container?

gaby commented Dec 4, 2023

@Aeyk Changes are being tracked in serge-chat/serge#43; GPU support is coming out this week. Dragonfly was removed.
