A script for running Hugging Face Text Generation Inference (TGI) on Modal Labs.
import os
import subprocess

from modal import Image, Secret, Stub, gpu, web_server

# Constants for the model and deployment setup.
MODEL_DIR = "/model"
MODEL_ID = "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"
QUANTIZATION = "awq"
DOCKER_IMAGE = "ghcr.io/huggingface/text-generation-inference:1.4"
PORT = 8000
def download_model():
    """
    Downloads the model weights from the Hugging Face Hub using the model ID,
    passing the HF_TOKEN environment variable through to the download process.
    """
    subprocess.run(
        [
            "text-generation-server",
            "download-weights",
            MODEL_ID,
        ],
        env={
            **os.environ,
            "HUGGING_FACE_HUB_TOKEN": os.environ["HF_TOKEN"],
        },
        check=True,
    )
# Configuration for GPU resource allocation.
GPU_CONFIG = gpu.A10G()

# Stub creation for managing the model deployment lifecycle.
stub = Stub("text-generation-inference-2")

# Configuration of the Docker image used for running the model server.
# The upstream TGI image's ENTRYPOINT is reset so Modal can run its own
# commands, and the weights are downloaded at build time so they are baked
# into the image rather than fetched on every cold start.
tgi_image = (
    Image.from_registry(DOCKER_IMAGE, add_python="3.10")
    .dockerfile_commands("ENTRYPOINT []")
    .run_function(download_model, timeout=60 * 20, secrets=[Secret.from_name("huggingface-secret")])
)
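# NOTE: Secret.from_name("huggingface-secret") assumes a Modal secret with that
# name already exists in your workspace. One way to create it (the token value
# below is a placeholder) is:
#   modal secret create huggingface-secret HF_TOKEN=<your-hf-token>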
@stub.function(
    image=tgi_image,
    gpu=GPU_CONFIG,
    concurrency_limit=1,
)
@web_server(port=PORT, startup_timeout=120)
def run_server():
    """
    Launches the text generation model server with the specified configuration.
    Starts a TGI server listening on the configured port, with the configured
    quantization, as a background subprocess.
    """
    cmd = (
        f"text-generation-launcher --model-id {MODEL_ID} "
        f"--hostname 0.0.0.0 --port {PORT} --quantize {QUANTIZATION}"
    )
    subprocess.Popen(cmd, shell=True)
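
To try it out, serve the app with `modal serve` (or `modal deploy` for a persistent endpoint); Modal prints the public URL of the web server on startup. A minimal client sketch, assuming the `requests` package is installed and using a placeholder URL (substitute the *.modal.run URL Modal gives you):

import requests

# Placeholder URL: replace with the *.modal.run endpoint printed by Modal.
TGI_URL = "https://<your-workspace>--text-generation-inference-2-run-server.modal.run"

# TGI exposes a /generate route that takes a prompt and generation parameters.
# Mistral-7B-Instruct expects its [INST] ... [/INST] prompt format.
response = requests.post(
    f"{TGI_URL}/generate",
    json={
        "inputs": "[INST] What is the capital of France? [/INST]",
        "parameters": {"max_new_tokens": 64},
    },
)
print(response.json()["generated_text"])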