A script for running Hugging Face Text Generation Inference (TGI) on Modal Labs.
import os
import subprocess

from modal import Image, Secret, Stub, gpu, web_server

# Constants for the model and deployment setup.
MODEL_DIR = "/model"
MODEL_ID = "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"
QUANTIZATION = "awq"
DOCKER_IMAGE = "ghcr.io/huggingface/text-generation-inference:1.4"
PORT = 8000
def download_model():
    """
    Downloads the model weights from the Hugging Face Hub using the model ID,
    passing the HF_TOKEN environment variable through to the download process.
    """
    subprocess.run(
        [
            "text-generation-server",
            "download-weights",
            MODEL_ID,
        ],
        env={
            **os.environ,
            "HUGGING_FACE_HUB_TOKEN": os.environ["HF_TOKEN"],
        },
        check=True,
    )
# Configuration for GPU resource allocation.
GPU_CONFIG = gpu.A10G()

# Stub creation for managing the model deployment lifecycle.
stub = Stub("text-generation-inference-2")

# Configuration of the Docker image used for running the model server.
# The upstream TGI image's ENTRYPOINT is reset so Modal can run its own
# commands, and the weights are downloaded at build time so they are baked
# into the image rather than fetched on every cold start.
tgi_image = (
    Image.from_registry(DOCKER_IMAGE, add_python="3.10")
    .dockerfile_commands("ENTRYPOINT []")
    .run_function(download_model, timeout=60 * 20, secrets=[Secret.from_name("huggingface-secret")])
)
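# NOTE: Secret.from_name("huggingface-secret") assumes a Modal secret with that
# name already exists in your workspace. One way to create it (the token value
# below is a placeholder) is:
#   modal secret create huggingface-secret HF_TOKEN=<your-hf-token>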
@stub.function(
    image=tgi_image,
    gpu=GPU_CONFIG,
    concurrency_limit=1,
)
@web_server(port=PORT, startup_timeout=120)
def run_server():
    """
    Launches the text generation model server with the specified configuration.
    Starts a TGI server listening on the configured port, with the configured
    quantization, as a background subprocess.
    """
    cmd = (
        f"text-generation-launcher --model-id {MODEL_ID} "
        f"--hostname 0.0.0.0 --port {PORT} --quantize {QUANTIZATION}"
    )
    subprocess.Popen(cmd, shell=True)
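
To try it out, serve the app with `modal serve` (or `modal deploy` for a persistent endpoint); Modal prints the public URL of the web server on startup. A minimal client sketch, assuming the `requests` package is installed and using a placeholder URL (substitute the *.modal.run URL Modal gives you):

import requests

# Placeholder URL: replace with the *.modal.run endpoint printed by Modal.
TGI_URL = "https://<your-workspace>--text-generation-inference-2-run-server.modal.run"

# TGI exposes a /generate route that takes a prompt and generation parameters.
# Mistral-7B-Instruct expects its [INST] ... [/INST] prompt format.
response = requests.post(
    f"{TGI_URL}/generate",
    json={
        "inputs": "[INST] What is the capital of France? [/INST]",
        "parameters": {"max_new_tokens": 64},
    },
)
print(response.json()["generated_text"])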