@charlesfrye
Created May 9, 2024 19:06
LLaMA 3 VLLM Engine in OpenAI-Compatible Mode
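A Modal app that serves meta-llama/Meta-Llama-3-8B-Instruct behind vLLM's OpenAI-compatible API server, followed by a client script that lists the served model and streams a chat completion from it.
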
import os
import subprocess

import modal

MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
MODEL_DIR = f"/models/{MODEL_NAME}"

N_GPU = 1
MINUTES = 60  # seconds


def download_model_to_image(model_dir, model_name):
    """Download the model weights into the container image at build time."""
    from huggingface_hub import snapshot_download
    from transformers.utils import move_cache

    os.makedirs(model_dir, exist_ok=True)
    snapshot_download(
        model_name,
        local_dir=model_dir,
        ignore_patterns=["*.pt", "*.bin"],  # using safetensors
        token=os.environ["HF_TOKEN"],
    )
    move_cache()


vllm_image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        [
            "vllm==0.4.1",
            "hf-transfer==0.1.6",
            "huggingface_hub==0.22.2",
            "fastapi",
            "httpx",
        ]
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster downloads from the Hub
    .run_function(  # bake the model weights into the image
        download_model_to_image,
        timeout=20 * MINUTES,
        kwargs={"model_dir": MODEL_DIR, "model_name": MODEL_NAME},
        secrets=[modal.Secret.from_name("huggingface-secret")],
    )
)
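
# Note: building this image assumes a Modal secret named "huggingface-secret"
# that provides HF_TOKEN, which download_model_to_image reads from the environment.
# Assuming the standard Modal CLI, something like
#   modal secret create huggingface-secret HF_TOKEN=<your token>
# creates it; the token also needs access to the gated Llama 3 repo on the Hub.
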
app = modal.App("vllm-openai-gist")


@app.function(
    image=vllm_image,
    gpu=modal.gpu.A10G(count=N_GPU),
    container_idle_timeout=8 * MINUTES,  # scale to zero after 8 minutes idle
)
@modal.web_server(
    port=8000,
    startup_timeout=5 * MINUTES,
)
def serve_vllm():
    command = (
        # NCCL bug in container runtime: multi-GPU setups hang (but not H100s)
        # "NCCL_P2P_DISABLE=1 "
        f"python -m vllm.entrypoints.openai.api_server --model {MODEL_DIR}"
        + f" --tensor-parallel-size {N_GPU}"
        + " --max-model-len 2048"
    )
    print("Starting server with command:", command)
    subprocess.Popen(command, shell=True)
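
To try this out, save the app above (the filename is arbitrary, say vllm_openai.py) and run it with the Modal CLI: modal serve vllm_openai.py starts a development server and prints a *.modal.run URL with a -dev suffix, matching the base_url used below, while modal deploy vllm_openai.py creates a persistent deployment instead. The separate client script below then lists the served model and streams a chat completion through the OpenAI Python SDK.
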
from openai import OpenAI


class Colors:
    """ANSI color codes for terminal output."""

    GREEN = "\033[0;32m"
    BLUE = "\033[0;34m"
    BOLD = "\033[1m"
    END = "\033[0m"


client = OpenAI(api_key="YourSecretToken")  # dummy key; the OpenAI client requires one

WORKSPACE = "modal-labs"  # replace with your own Modal workspace name
assert WORKSPACE != "modal-labs", "Please set your workspace name"

client.base_url = f"https://{WORKSPACE}--vllm-openai-gist-serve-vllm-dev.modal.run/v1"

model = client.models.list().data[0]

print(
    Colors.GREEN,
    Colors.BOLD,
    f"Requesting completion from model {model.id}",
    Colors.END,
    sep="",
)

stream = client.chat.completions.create(
    model=model.id,  # by default, the same as the model directory name
    messages=[
        {
            "role": "system",
            "content": "You are a poetic assistant, skilled in writing satirical doggerel with creative flair.",
        },
        {
            "role": "user",
            "content": "Compose a limerick about baboons and raccoons.",
        },
    ],
    stream=True,
)

print(Colors.BLUE)
for chunk in stream:
    if chunk.choices[0].delta.content is not None:
        print(chunk.choices[0].delta.content, end="")
print(Colors.END)
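
# Since the endpoint is OpenAI-compatible, non-streaming requests work the same way.
# A minimal sketch reusing the client and model from above (the prompt is just an example):
response = client.chat.completions.create(
    model=model.id,
    messages=[{"role": "user", "content": "Now a haiku about raccoons."}],
)
print(response.choices[0].message.content)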