@agyaatcoder
Created April 24, 2024 19:11
LLM inference on Modal Labs through the vLLM engine. This exposes an endpoint that is compatible with the OpenAI Python client.

import os
import subprocess

from modal import Image, Secret, Stub, gpu, web_server

MODEL_DIR = "/model"
BASE_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
GPU_CONFIG = gpu.A100(memory=80, count=2)


def download_model_to_folder():
    from huggingface_hub import snapshot_download
    from transformers.utils import move_cache

    os.makedirs(MODEL_DIR, exist_ok=True)

    snapshot_download(
        BASE_MODEL,
        local_dir=MODEL_DIR,
        ignore_patterns=["*.pt", "*.bin"],  # Using safetensors
    )
    move_cache()

# ### Image definition
# We’ll start from a recommended Docker Hub image and install `vLLM`.
# Then we’ll use `run_function` to run the function defined above to ensure the weights of
# the model are saved within the container image.
image = (
    Image.from_registry(
        "nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10"
    )
    .pip_install(
        "vllm==0.2.5",
        "huggingface_hub==0.19.4",
        "hf-transfer==0.1.4",
        "torch==2.1.2",
    )
    # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_function(
        download_model_to_folder,
        secrets=[Secret.from_name("huggingface")],
        timeout=60 * 20,
    )
)
stub = Stub("multi-gpu-inference", image=image)

@stub.function(allow_concurrent_inputs=100, gpu=GPU_CONFIG, container_idle_timeout=300)
@web_server(8000, startup_timeout=600)
def my_file_server():
    if GPU_CONFIG.count > 1:
        # Patch issue from https://github.com/vllm-project/vllm/issues/1116
        import ray

        ray.shutdown()
        ray.init(num_gpus=GPU_CONFIG.count)

    # Launch vLLM's OpenAI-compatible API server in the background;
    # @web_server waits until port 8000 starts accepting connections.
    subprocess.Popen(
        "python -m vllm.entrypoints.openai.api_server"
        " --model mistralai/Mixtral-8x7B-Instruct-v0.1"
        " --tensor-parallel-size 2 --host 0.0.0.0 --port 8000",
        shell=True,
    )
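
Once deployed (for example with `modal deploy`), Modal prints the endpoint URL, and the server can be queried with the standard OpenAI Python client. Below is a minimal sketch, assuming an openai>=1.0 client; the base_url is a hypothetical placeholder for whatever URL Modal assigns your deployment, and the API key is a dummy value since the server as configured here does not enforce one.

from openai import OpenAI

# The base_url is a placeholder -- substitute the URL that `modal deploy`
# prints for this app. Note the trailing /v1.
client = OpenAI(
    base_url="https://your-workspace--multi-gpu-inference-my-file-server.modal.run/v1",
    api_key="not-needed",  # this server does not check API keys; any string works
)

response = client.chat.completions.create(
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",
    messages=[{"role": "user", "content": "Explain tensor parallelism in one sentence."}],
    max_tokens=128,
)
print(response.choices[0].message.content)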