mgolub2/serverless_llama.sh

## serverless_llama.sh
#!/bin/bash

# Abort unless running as root: the steps below write under /etc/cdi and
# /etc/systemd/system and call systemctl.
if [[ "${EUID}" -ne 0 ]]; then
  # Diagnostics go to stderr so they never mix with regular output.
  echo "This script needs to be run as root. Exiting." >&2
  exit 1
fi

# Step 1: Nvidia GPU setup with Podman
# The container created in Step 2 is given `--device nvidia.com/gpu=all`.
# Write the Nvidia CDI spec with nvidia-ctk unless the file is already there.
cdi_spec="/etc/cdi/nvidia.yaml"
if [ -f "$cdi_spec" ]; then
  : # spec already present; leave it untouched
else
  nvidia-ctk cdi generate --output="$cdi_spec"
fi

# Optional: print the CDI devices nvidia-ctk reports
nvidia-ctk cdi list

# Step 2: Create a new Podman container for the OpenAI API server
# Host directory holding the .gguf model files. Override it by exporting
# LLAMA_MODELS_DIR; the default is the placeholder path that must otherwise
# be edited by hand.
llama_models_dir="${LLAMA_MODELS_DIR:-/where/you/put/models/}"

# --replace removes an existing container named llama-oai first, so the script
# can be re-run without "name already in use" errors.
# The server listens on 18080 inside the container and is published on the
# same host port; Step 5 points the socket proxy at 127.0.0.1:18080.
podman create --replace --name llama-oai \
  -p 18080:18080 \
  --ulimit memlock=-1:-1 \
  --device nvidia.com/gpu=all \
  --security-opt=label=disable \
  -v "${llama_models_dir}:/models:Z" \
  ghcr.io/abetlen/llama-cpp-python:latest \
  python3 -m llama_cpp.server \
  --model /models/shiningvaliant-1.2.Q4_K_M.gguf \
  --n_gpu_layers 90 \
  --port 18080 \
  --host 0.0.0.0 \
  --n_ctx 4096 || {
  # Without the container, Step 3 cannot generate a unit file; stop here.
  echo "podman create failed for container llama-oai. Exiting." >&2
  exit 1
}

# Step 3: Generate and modify the systemd service file
llama_unit_path=/etc/systemd/system/llama-oai.service

# Readiness probe appended to the unit: poll /v1/models once per second, for at
# most 30 seconds, until the server answers. The redirect is written as
# `>/dev/null 2>&1` because the loop runs under `sh`, where the bash-only
# `>& file` form is not portable (e.g. when sh is dash).
llama_ready_probe="ExecStartPost=/usr/bin/timeout 30 sh -c 'while ! curl http://127.0.0.1:18080/v1/models >/dev/null 2>&1; do sleep 1; done'"

# Capture the generated unit first so a podman failure aborts the script
# instead of leaving an empty or partial unit file behind.
if ! llama_generated_unit="$(podman generate systemd --new --name llama-oai)"; then
  echo "podman generate systemd failed for llama-oai. Exiting." >&2
  exit 1
fi

# Edits applied to the generated unit, in order:
#   1. drop the [Install] section (the proxy service from Step 5 pulls this
#      unit in through Requires=, so it is not enabled on its own);
#   2. add StopWhenUnneeded=yes after the RequiresMountsFor= line;
#   3. add the readiness probe after the NotifyAccess=all line.
# `sed -z` reads the whole input as one record so the two-line [Install]
# pattern can match; the line-addressed edits therefore need separate seds.
printf '%s\n' "$llama_generated_unit" |
  sed -z 's/\[Install\]\nWantedBy=default.target//g' |
  sed '/RequiresMountsFor=%t\/containers/a StopWhenUnneeded=yes' |
  sed "/NotifyAccess=all/a ${llama_ready_probe}" \
  > "$llama_unit_path"

# Step 4: Create the socket file for the proxy
# The socket unit listens on 0.0.0.0:8000 and is installed under
# sockets.target. One printf argument becomes one line of the unit file.
printf '%s\n' \
  '[Socket]' \
  'ListenStream=0.0.0.0:8000' \
  '[Install]' \
  'WantedBy=sockets.target' \
  > /etc/systemd/system/llama-oai-proxy.socket

# Step 5: Create the service file for the proxy
# The proxy service requires, and is ordered after, both the llama-oai
# container service and its own socket unit (llama-oai-proxy.socket, written
# in Step 4). systemd-socket-proxyd forwards connections to the server on
# 127.0.0.1:18080 and is started with --exit-idle-time=30s.
# The heredoc delimiter is quoted so the unit text is written literally.
cat << 'EOF' > /etc/systemd/system/llama-oai-proxy.service
[Unit]
Requires=llama-oai.service
After=llama-oai.service
Requires=llama-oai-proxy.socket
After=llama-oai-proxy.socket
[Service]
ExecStart=/usr/lib/systemd/systemd-socket-proxyd --exit-idle-time=30s 127.0.0.1:18080
EOF

# Step 6: Reload systemd, then enable and start the proxy socket
# Only the socket unit is enabled here; the service units written above have
# no [Install] section of their own.
systemctl daemon-reload
systemctl enable --now llama-oai-proxy.socket

# Step 7: Validate the setup
# Check that the socket is open. Prefer `ss` and fall back to `netstat` when
# ss is not installed. Matching ':8000' avoids hits on unrelated numbers such
# as PIDs.
if command -v ss > /dev/null 2>&1; then
  ss -ltpn | grep ':8000'
else
  netstat -ltpn | grep ':8000'
fi

# Optional: Test the OpenAI API
# The request goes to the loopback address because the socket from Step 4
# listens on 0.0.0.0:8000 on this same host.
curl http://127.0.0.1:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "shiningvaliant-1.2.Q4_K_M.gguf", "prompt": "What is the highest resolution Phase One camera?", "temperature": 0.1, "max_tokens": 128}'

# End of Script
	#!/bin/bash

	# Tab-indented copy of the setup script. Pipes are real `|` operators and
	# heredocs use `<<-`, which strips leading tabs, so the indented `EOF`
	# terminators are recognised and the unit files carry no indentation.

	# Ensuring the script is running as root
	if [[ "${EUID}" -ne 0 ]]; then
		# Diagnostics go to stderr.
		echo "This script needs to be run as root. Exiting." >&2
		exit 1
	fi

	# Step 1: Nvidia GPU setup with Podman
	# Generate Nvidia CDI configuration if it doesn't already exist
	if [ ! -f "/etc/cdi/nvidia.yaml" ]; then
		nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
	fi

	# Optional: List available Nvidia GPUs
	nvidia-ctk cdi list

	# Step 2: Create a new Podman container for the OpenAI API server
	# Host directory with the model files; override with LLAMA_MODELS_DIR.
	llama_models_dir="${LLAMA_MODELS_DIR:-/where/you/put/models/}"

	# --replace removes an existing llama-oai container so re-runs succeed.
	podman create --replace --name llama-oai \
		-p 18080:18080 \
		--ulimit memlock=-1:-1 \
		--device nvidia.com/gpu=all \
		--security-opt=label=disable \
		-v "${llama_models_dir}:/models:Z" \
		ghcr.io/abetlen/llama-cpp-python:latest \
		python3 -m llama_cpp.server \
		--model /models/shiningvaliant-1.2.Q4_K_M.gguf \
		--n_gpu_layers 90 \
		--port 18080 \
		--host 0.0.0.0 \
		--n_ctx 4096 || {
		echo "podman create failed for container llama-oai. Exiting." >&2
		exit 1
	}

	# Step 3: Generate and modify the systemd service file
	# Readiness probe: poll /v1/models once per second for at most 30 seconds.
	# `>/dev/null 2>&1` is used because the loop runs under `sh`, where the
	# bash-only `>& file` redirect is not portable.
	llama_ready_probe="ExecStartPost=/usr/bin/timeout 30 sh -c 'while ! curl http://127.0.0.1:18080/v1/models >/dev/null 2>&1; do sleep 1; done'"

	# Capture the unit first so a podman failure cannot leave an empty file.
	if ! llama_generated_unit="$(podman generate systemd --new --name llama-oai)"; then
		echo "podman generate systemd failed for llama-oai. Exiting." >&2
		exit 1
	fi

	# Drop the [Install] section, add StopWhenUnneeded=yes, add the probe.
	printf '%s\n' "$llama_generated_unit" |
		sed -z 's/\[Install\]\nWantedBy=default.target//g' |
		sed '/RequiresMountsFor=%t\/containers/a StopWhenUnneeded=yes' |
		sed "/NotifyAccess=all/a ${llama_ready_probe}" \
		> /etc/systemd/system/llama-oai.service

	# Step 4: Create the socket file for the proxy
	cat <<- 'EOF' > /etc/systemd/system/llama-oai-proxy.socket
	[Socket]
	ListenStream=0.0.0.0:8000
	[Install]
	WantedBy=sockets.target
	EOF

	# Step 5: Create the service file for the proxy
	# After= names llama-oai-proxy.socket, the socket unit written in Step 4.
	cat <<- 'EOF' > /etc/systemd/system/llama-oai-proxy.service
	[Unit]
	Requires=llama-oai.service
	After=llama-oai.service
	Requires=llama-oai-proxy.socket
	After=llama-oai-proxy.socket
	[Service]
	ExecStart=/usr/lib/systemd/systemd-socket-proxyd --exit-idle-time=30s 127.0.0.1:18080
	EOF

	# Step 6: Reload systemd, then enable and start the proxy socket
	systemctl daemon-reload
	systemctl enable --now llama-oai-proxy.socket

	# Step 7: Validate the setup
	# Check that the socket is open (prefer ss, fall back to netstat)
	if command -v ss > /dev/null 2>&1; then
		ss -ltpn | grep ':8000'
	else
		netstat -ltpn | grep ':8000'
	fi

	# Optional: Test the OpenAI API
	curl http://127.0.0.1:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "shiningvaliant-1.2.Q4_K_M.gguf", "prompt": "What is the highest resolution Phase One camera?", "temperature": 0.1, "max_tokens": 128}'

	# End of Script