dcbark01/tgi_api.sh

## tgi_api.sh
#!/bin/bash
#SBATCH --job-name=llm-swarm
#SBATCH --partition hopper-prod
#SBATCH --gpus={{gpus}}
#SBATCH --cpus-per-task=12
#SBATCH --mem-per-cpu=11G
#SBATCH -o slurm/logs/%x_%j.out

# See original source here:
# https://github.com/huggingface/llm-swarm/blob/main/templates/tgi_h100.template.slurm

# Choose the host directory that gets mounted into the container as /data.
# Default to a local .cache directory; HF cluster nodes that provide the
# shared /fsx/.cache directory use that instead.
volume=".cache"
if [ -d "/fsx/.cache" ]; then
    volume="/fsx/.cache"
fi
export volume

# Template placeholders, substituted before the job script is submitted.
export model={{model}}
export revision={{revision}}

#######################################
# Print randomly chosen TCP ports from 1025-65535 that do not appear as a
# local port in the output of `ss -Htan`.
# Arguments: $1 - how many ports to print (default 1)
# Outputs:   one port number per line on stdout
#######################################
unused_port() {
    local count=${1:-1}
    # comm needs both inputs sorted with the same collation, so pin LC_ALL=C.
    LC_ALL=C comm -23 \
        <(seq 1025 65535 | LC_ALL=C sort) \
        <(ss -Htan |
            # Column 4 is "local-address:port". Take the text after the LAST
            # colon so IPv6 forms such as [::]:22 yield the port as well.
            awk '{ n = split($4, parts, ":"); print parts[n] }' |
            LC_ALL=C sort -u) |
        shuf |
        head -n "$count"
}
# Pick a free port for the TGI server. Assign first and export afterwards so
# an empty result is detected instead of advertising an unusable endpoint.
PORT=$(unused_port)
if [ -z "$PORT" ]; then
    echo "error: could not find an unused TCP port" >&2
    exit 1
fi
export PORT

# Fall back to the token file under ~/.cache/huggingface when the token is
# not already set in the environment.
if [ -z "$HUGGING_FACE_HUB_TOKEN" ]; then
    token_file=~/.cache/huggingface/token
    if [ -r "$token_file" ]; then
        HUGGING_FACE_HUB_TOKEN=$(cat "$token_file")
    else
        echo "warning: HUGGING_FACE_HUB_TOKEN is unset and $token_file is not readable" >&2
        HUGGING_FACE_HUB_TOKEN=""
    fi
    export HUGGING_FACE_HUB_TOKEN
fi
echo "Starting TGI container port $PORT"
# Append this node's endpoint (first address printed by `hostname -I`) to the
# hosts file named by the template.
echo "http://$(hostname -I | awk '{print $1}'):$PORT" >> {{slurm_hosts_path}}
# unset cache dirs to avoid pyxis having host env var somehow get into the container
unset HF_HUB_CACHE HF_ASSETS_CACHE HF_DATASETS_CACHE HF_MODULES_CACHE
# Run text-generation-launcher inside the TGI container image, with $volume
# mounted at /data and only the token and port passed through from the host.
# The final argument deliberately has no trailing backslash, so the command
# ends here regardless of what follows.
srun --container-image='ghcr.io#huggingface/text-generation-inference' \
    --container-env=HUGGING_FACE_HUB_TOKEN,PORT \
    --container-mounts="$volume:/data" \
    --no-container-mount-home \
    --qos normal \
    /usr/local/bin/text-generation-launcher \
    --model-id "$model" \
    --revision "$revision" \
    --max-concurrent-requests 2000 \
    --max-total-tokens {{model_max_length}} \
    --max-input-length {{model_input_length}} \
    --max-batch-prefill-tokens {{model_max_length}}

echo "End of job"
	#!/bin/bash
	#SBATCH --job-name=llm-swarm
	#SBATCH --partition hopper-prod
	#SBATCH --gpus={{gpus}}
	#SBATCH --cpus-per-task=12
	#SBATCH --mem-per-cpu=11G
	#SBATCH -o slurm/logs/%x_%j.out

	# See original source here:
	# https://github.com/huggingface/llm-swarm/blob/main/templates/tgi_h100.template.slurm

	# Choose the host directory that gets mounted into the container as /data.
	# Default to a local .cache directory; HF cluster nodes that provide the
	# shared /fsx/.cache directory use that instead.
	volume=".cache"
	if [ -d "/fsx/.cache" ]; then
		volume="/fsx/.cache"
	fi
	export volume

	# Template placeholders, substituted before the job script is submitted.
	export model={{model}}
	export revision={{revision}}

	#######################################
	# Print randomly chosen TCP ports from 1025-65535 that do not appear as a
	# local port in the output of `ss -Htan`.
	# Arguments: $1 - how many ports to print (default 1)
	# Outputs:   one port number per line on stdout
	#######################################
	unused_port() {
		local count=${1:-1}
		# The pipes must be real (unescaped) pipes: an escaped one is handed
		# to the command as a literal "|" argument.
		# comm needs both inputs sorted with the same collation, so pin LC_ALL=C.
		LC_ALL=C comm -23 \
			<(seq 1025 65535 | LC_ALL=C sort) \
			<(ss -Htan |
				# Column 4 is "local-address:port". Take the text after the
				# LAST colon so IPv6 forms such as [::]:22 yield the port too.
				awk '{ n = split($4, parts, ":"); print parts[n] }' |
				LC_ALL=C sort -u) |
			shuf |
			head -n "$count"
	}
	# Pick a free port for the TGI server. Assign first and export afterwards so
	# an empty result is detected instead of advertising an unusable endpoint.
	PORT=$(unused_port)
	if [ -z "$PORT" ]; then
		echo "error: could not find an unused TCP port" >&2
		exit 1
	fi
	export PORT

	# Fall back to the token file under ~/.cache/huggingface when the token is
	# not already set in the environment.
	if [ -z "$HUGGING_FACE_HUB_TOKEN" ]; then
		token_file=~/.cache/huggingface/token
		if [ -r "$token_file" ]; then
			HUGGING_FACE_HUB_TOKEN=$(cat "$token_file")
		else
			echo "warning: HUGGING_FACE_HUB_TOKEN is unset and $token_file is not readable" >&2
			HUGGING_FACE_HUB_TOKEN=""
		fi
		export HUGGING_FACE_HUB_TOKEN
	fi
	echo "Starting TGI container port $PORT"
	# Append this node's endpoint (first address printed by `hostname -I`) to the
	# hosts file named by the template. The pipe must be unescaped, otherwise
	# hostname receives "|" and the awk program as arguments.
	echo "http://$(hostname -I | awk '{print $1}'):$PORT" >> {{slurm_hosts_path}}
	# unset cache dirs to avoid pyxis having host env var somehow get into the container
	unset HF_HUB_CACHE HF_ASSETS_CACHE HF_DATASETS_CACHE HF_MODULES_CACHE
	# Run text-generation-launcher inside the TGI container image, with $volume
	# mounted at /data and only the token and port passed through from the host.
	# The final argument deliberately has no trailing backslash, so the command
	# ends here regardless of what follows.
	srun --container-image='ghcr.io#huggingface/text-generation-inference' \
		--container-env=HUGGING_FACE_HUB_TOKEN,PORT \
		--container-mounts="$volume:/data" \
		--no-container-mount-home \
		--qos normal \
		/usr/local/bin/text-generation-launcher \
		--model-id "$model" \
		--revision "$revision" \
		--max-concurrent-requests 2000 \
		--max-total-tokens {{model_max_length}} \
		--max-input-length {{model_input_length}} \
		--max-batch-prefill-tokens {{model_max_length}}

	echo "End of job"