Run a Llama 2 inference server on Slurm. The gist contains a Python client that discovers running servers through their Slurm job comments, and a batch script (inference_server_SHARED.sh) that launches a vLLM OpenAI-compatible API server and advertises its model, host, and port in the job comment.
import json
import subprocess
import random
import openai
def get_slurm_job_by_name(name):
    """Return Slurm jobs matching `name`, with each job comment parsed into a dict."""
    # subprocess passes arguments directly (no shell), so the format string is not quoted
    command = ["squeue", "-h", f"--name={name}", "--format=%A %j %T %P %U %k %N"]
    output = subprocess.check_output(command, text=True)

    jobs = []
    for line in output.splitlines():
        job_id, job_name, status, partition, user, comment, nodes = line.split(' ')

        # The comment is expected to hold `key=value` pairs separated by `|`
        # (the format written by the batch script below)
        data = dict()
        if comment != "(null)":
            for kv in comment.split('|'):
                try:
                    k, v = kv.split('=', maxsplit=1)
                    data[k] = v
                except ValueError:
                    pass

        jobs.append({
            "job_id": job_id,
            "job_name": job_name,
            "status": status,
            "partition": partition,
            "user": user,
            "comment": data,
            "nodes": nodes,
        })

    return jobs
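# Example (hypothetical job id, partition, user, and host): an squeue output line such as
#   12345 inference_server_SHARED.sh RUNNING main alice model=/network/weights//llama.var/llama2//Llama-2-7b-chat-hf|host=cn-a001|port=9123|shared=y cn-a001
# is parsed into a job dict whose "comment" entry becomes
#   {"model": "/network/weights//llama.var/llama2//Llama-2-7b-chat-hf",
#    "host": "cn-a001", "port": "9123", "shared": "y"}
# (this is the comment format written by the batch script below with `scontrol update`).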
def find_suitable_inference_server(jobs, model):
    selected = []

    def is_shared(job):
        return job["comment"].get("shared", 'y') == 'y'

    def is_running(job):
        return job['status'] == "RUNNING"

    def has_model(job, model):
        if model is None:
            return True
        return job['comment']['model'] == model

    def select(job):
        selected.append({
            "model": job['comment']["model"],
            "host": job["comment"]["host"],
            "port": job["comment"]["port"],
        })

    for job in jobs:
        if is_shared(job) and is_running(job):
            if has_model(job, model):
                select(job)

    return selected
def get_inference_server(model=None):
    jobs = get_slurm_job_by_name('inference_server_SHARED.sh')
    servers = find_suitable_inference_server(jobs, model)

    try:
        return random.choice(servers)
    except IndexError:
        return None
def get_endpoint(model):
    server = get_inference_server(model)
    if server is None:
        raise RuntimeError(f"No running inference server was found for {model}")
    return f"http://{server['host']}:{server['port']}/v1"
model = "/network/weights//llama.var/llama2//Llama-2-7b-chat-hf"

# Modify OpenAI's API key and API base to use vLLM's API server.
openai.api_key = "EMPTY"
openai.api_base = get_endpoint(model)

completion = openai.Completion.create(
    model=model,
    prompt="San Francisco is a",
)

print("Completion result:", completion)
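The call above goes through vLLM's OpenAI-compatible /v1/completions route. If the installed vLLM version also exposes /v1/chat/completions, the same endpoint can be reused with the chat API of the pre-1.0 openai package; a minimal sketch (the prompt content is only an example):

chat = openai.ChatCompletion.create(
    model=model,
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print("Chat result:", chat)

The batch script below, inference_server_SHARED.sh (the job name the client looks for), sets up the environment, starts the vLLM server, and advertises it through the Slurm job comment.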
#!/bin/bash
#
# Assume you have conda installed
#
# Usage:
#
#   sbatch --ntasks-per-node=1 --mem=32G  inference_server_SHARED.sh Llama-2-7b-chat-hf
#   sbatch --ntasks-per-node=2 --mem=64G  inference_server_SHARED.sh Llama-2-13b-chat-hf
#   sbatch --ntasks-per-node=8 --mem=192G inference_server_SHARED.sh Llama-2-70b-chat-hf
#
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
#SBATCH --time=00:15:00
# Defaults (override with --ntasks-per-node / --mem on the sbatch command line as above)
#SBATCH --ntasks-per-node=1
#SBATCH --mem=32G
WEIGHTS="/network/weights//llama.var/llama2/"
MODEL="$1"
# MODEL="Llama-2-7b-chat-hf"
MODEL_HF="meta-llama/$MODEL"
# Number of GPUs (one per task) required for each model
declare -A GPUS
GPUS["meta-llama/Llama-2-7b-hf"]=1
GPUS["meta-llama/Llama-2-7b-chat-hf"]=1
GPUS["meta-llama/Llama-2-13b-hf"]=2
GPUS["meta-llama/Llama-2-13b-chat-hf"]=2
GPUS["meta-llama/Llama-2-70b-hf"]=8
GPUS["meta-llama/Llama-2-70b-chat-hf"]=8
# Not strictly necessary because the model is loaded from a local path
export HUGGINGFACE_HUB_CACHE="/network/weights/shared_cache/huggingface/hub"
NTASKS_PER_NODE=$SLURM_NTASKS_PER_NODE
# Launch only if the model is known and the task count matches its GPU requirement
if [ ${GPUS[$MODEL_HF]+_} ] && [ "${GPUS[$MODEL_HF]}" -eq "$NTASKS_PER_NODE" ]; then
    cd "$SLURM_TMPDIR"

    git clone https://gist.github.com/8c866a81cd696ca4cc01df26d6849764.git llama2_server
    cd llama2_server

    # Set up a throwaway conda environment with vLLM
    CONDA_EXEC="$(which conda)"
    CONDA_BASE=$(dirname "$CONDA_EXEC")
    source "$CONDA_BASE/../etc/profile.d/conda.sh"

    conda create --prefix ./env python=3.9 -y
    conda activate ./env
    pip install vllm

    PORT=9123
    HOST="$(hostname)"
    NAME="$WEIGHTS/$MODEL"

    # Advertise the server location in the job comment so clients can find it
    scontrol update job $SLURM_JOB_ID comment="model=$NAME|host=$HOST|port=$PORT|shared=y"

    python -m vllm.entrypoints.openai.api_server --host "$HOST" --port "$PORT" --model "$NAME"
    #   --tokenizer TOKENIZER
    #   --tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE
    #   --pipeline-parallel-size PIPELINE_PARALLEL_SIZE
    #   --quantization {awq,None}, -q {awq,None}
    #   --max-num-seqs MAX_NUM_SEQS
    #   --max-num-batched-tokens MAX_NUM_BATCHED_TOKENS
    #   --block-size {8,16,32}        token block size
else
    echo "Model $MODEL_HF is not known, or --ntasks-per-node=$NTASKS_PER_NODE does not match its required GPU count"
fi
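After submitting the script (e.g. sbatch --ntasks-per-node=1 --mem=32G inference_server_SHARED.sh Llama-2-7b-chat-hf), the server only becomes discoverable once the conda environment is built, vLLM is installed, and the job comment has been set. A small polling helper on the client side can wait for that; this is a sketch reusing get_inference_server from the client above (wait_for_server itself is not part of the gist):

import time

def wait_for_server(model=None, timeout=1800, poll=30):
    """Poll squeue until a shared inference server (optionally serving `model`) shows up."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        server = get_inference_server(model)
        if server is not None:
            return server
        time.sleep(poll)
    raise TimeoutError(f"No shared inference server found for {model!r}")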