A quick script to set up GPU-accelerated serverless llama.cpp using Podman and systemd
#!/bin/bash

# Ensure the script is running as root
if [ "$EUID" -ne 0 ]; then
    echo "This script needs to be run as root. Exiting."
    exit 1
fi
# Step 1: Nvidia GPU setup with Podman
# Generate the Nvidia CDI configuration if it doesn't already exist
if [ ! -f "/etc/cdi/nvidia.yaml" ]; then
    nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
fi

# Optional: list the Nvidia devices the CDI spec exposes
nvidia-ctk cdi list
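
# Optional sanity check that Podman can actually reach the GPU through CDI
# (assumes an image with nvidia-smi is available; any CUDA base image works):
# podman run --rm --device nvidia.com/gpu=all --security-opt=label=disable \
#     docker.io/nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi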
# Step 2: Create a new Podman container for the OpenAI-compatible API server
podman create --name llama-oai \
    -p 18080:18080 \
    --ulimit memlock=-1:-1 \
    --device nvidia.com/gpu=all \
    --security-opt=label=disable \
    -v /where/you/put/models/:/models:Z \
    ghcr.io/abetlen/llama-cpp-python:latest \
    python3 -m llama_cpp.server \
        --model /models/shiningvaliant-1.2.Q4_K_M.gguf \
        --n_gpu_layers 90 \
        --port 18080 \
        --host 0.0.0.0 \
        --n_ctx 4096
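
# Flag notes: --ulimit memlock=-1:-1 lets llama.cpp mlock model weights in
# RAM, --device nvidia.com/gpu=all passes all GPUs through via the CDI spec
# generated above, and --security-opt=label=disable keeps SELinux from
# blocking access to the injected device nodes.
# Optional smoke test before wiring up systemd (assumes the model path above
# points at a real .gguf file):
# podman start llama-oai && sleep 15 && curl http://127.0.0.1:18080/v1/models; podman stop llama-oai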
# Step 3: Generate the systemd unit for the container and patch it:
#   - drop the [Install] section so the service can't be enabled on its own
#     (it should only ever be pulled in by the proxy service below)
#   - add StopWhenUnneeded=yes so systemd stops the container once nothing
#     depends on it anymore
#   - add an ExecStartPost that waits (up to 30s) until the API answers, so
#     dependent units only start once the model is actually loaded
podman generate systemd --new --name llama-oai | \
    sed -z 's/\[Install\]\nWantedBy=default.target//g' | \
    sed '/RequiresMountsFor=%t\/containers/a StopWhenUnneeded=yes' | \
    sed '/NotifyAccess=all/a ExecStartPost=\/usr\/bin\/timeout 30 sh -c '\''while ! curl http:\/\/127.0.0.1:18080\/v1\/models >& \/dev\/null; do sleep 1; done'\''' \
    > /etc/systemd/system/llama-oai.service
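
# Optional: inspect the patched unit before loading it
# cat /etc/systemd/system/llama-oai.service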
# Step 4: Create the socket unit for the proxy
cat << EOF > /etc/systemd/system/llama-oai-proxy.socket
[Socket]
ListenStream=0.0.0.0:8000

[Install]
WantedBy=sockets.target
EOF
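
# With socket activation, systemd itself listens on port 8000; nothing
# heavyweight runs until the first client actually connects.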
# Step 5: Create the service unit for the proxy
cat << EOF > /etc/systemd/system/llama-oai-proxy.service
[Unit]
Requires=llama-oai.service
After=llama-oai.service
Requires=llama-oai-proxy.socket
After=llama-oai-proxy.socket

[Service]
ExecStart=/usr/lib/systemd/systemd-socket-proxyd --exit-idle-time=30s 127.0.0.1:18080
EOF
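
# When a connection arrives on the socket, systemd starts this proxy, which
# (via Requires=) pulls up the llama-oai container and forwards traffic from
# port 8000 to 18080. After 30s with no connections the proxy exits, and
# StopWhenUnneeded=yes then stops the container too, freeing the GPU: the
# "serverless" part of the setup.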
# Step 6: Reload systemd and enable/start only the socket
systemctl daemon-reload
systemctl enable --now llama-oai-proxy.socket
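
# Neither service is enabled directly: llama-oai.service and
# llama-oai-proxy.service are only started on demand through the socket.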
# Step 7: Validate the setup
# Check that the socket is listening (ss -ltpn works where net-tools is absent)
netstat -ltpn | grep 8000

# Optional: test the OpenAI-compatible API end to end. The first request
# triggers a cold start, so expect a delay while the model loads.
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "shiningvaliant-1.2.Q4_K_M.gguf", "prompt": "What is the highest resolution Phase One camera?", "temperature": 0.1, "max_tokens": 128}'

# End of Script
Right now, this is a very basic script with hardcoded paths and models.
Change what you need and have fun!
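
For example, here is a minimal sketch of how the hardcoded bits could be pulled out into variables at the top of the script; the names are illustrative, not part of the original:

# Hypothetical knobs -- edit these instead of hunting through the script.
MODEL_DIR=/where/you/put/models
MODEL_FILE=shiningvaliant-1.2.Q4_K_M.gguf
API_PORT=18080
PROXY_PORT=8000
GPU_LAYERS=90
CTX_SIZE=4096

podman create --name llama-oai \
    -p "${API_PORT}:${API_PORT}" \
    --ulimit memlock=-1:-1 \
    --device nvidia.com/gpu=all \
    --security-opt=label=disable \
    -v "${MODEL_DIR}:/models:Z" \
    ghcr.io/abetlen/llama-cpp-python:latest \
    python3 -m llama_cpp.server \
        --model "/models/${MODEL_FILE}" \
        --n_gpu_layers "${GPU_LAYERS}" \
        --port "${API_PORT}" \
        --host 0.0.0.0 \
        --n_ctx "${CTX_SIZE}"

${PROXY_PORT} would similarly replace the hardcoded 8000 in the socket unit's ListenStream line; since the heredocs use an unquoted EOF, shell variables expand inside them as-is.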