Skip to content

Instantly share code, notes, and snippets.

@agyaatcoder
Last active April 18, 2024 16:54
Show Gist options
  • Save agyaatcoder/9ed324ccfa5ffeb51c92b14ce15a2a64 to your computer and use it in GitHub Desktop.
Save agyaatcoder/9ed324ccfa5ffeb51c92b14ce15a2a64 to your computer and use it in GitHub Desktop.
Script to get the Llama-3-8B-Instruct model running on Modal Labs
# Meta-Llama-3-8B-Instruct is a gated model; you must first be granted access on Hugging Face to run this successfully.
import os
import subprocess
from modal import Image, Secret, Stub, gpu, web_server
# Model and server configuration.
MODEL_DIR = "/model"  # where weights are cached inside the container
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
DOCKER_IMAGE = "ghcr.io/huggingface/text-generation-inference:1.4"
PORT = 8000

# CLI flags for text-generation-launcher.
LAUNCH_FLAGS = [
    "--model-id",
    MODEL_ID,
    "--port",
    str(PORT),  # derived from PORT so the two values cannot drift apart
]
def download_model():
    """Pre-fetch the model weights with TGI's downloader so they are baked into the image."""
    # TGI reads the token from HUGGING_FACE_HUB_TOKEN; the Modal secret exposes it as HF_TOKEN.
    env = dict(os.environ)
    env["HUGGING_FACE_HUB_TOKEN"] = os.environ["HF_TOKEN"]

    download_cmd = ["text-generation-server", "download-weights", MODEL_ID]
    # check=True: fail the image build loudly if the download does not succeed.
    subprocess.run(download_cmd, env=env, check=True)
# Hardware and app setup.
GPU_CONFIG = gpu.A100(memory=80)  # 80 GB A100 — comfortably fits the 8B model
stub = Stub("llama3-8b-instruct")

# Start from the official TGI container, clear its entrypoint so Modal can
# invoke our function, then bake the downloaded weights into the image.
tgi_image = Image.from_registry(DOCKER_IMAGE, add_python="3.10")
tgi_image = tgi_image.dockerfile_commands("ENTRYPOINT []")
tgi_image = tgi_image.run_function(
    download_model,
    timeout=60 * 20,  # the weight download can take a while
    secrets=[Secret.from_name("hf-secret-llama")],
)
@stub.function(
    image=tgi_image,
    gpu=GPU_CONFIG,
    concurrency_limit=10,
    secrets=[Secret.from_name("hf-secret-llama")],  # name of the secret on Modal Labs; change here to use yours
)
@web_server(port=PORT, startup_timeout=120)
def run_server():
    """Start the TGI launcher serving MODEL_ID on 0.0.0.0:PORT.

    The process is launched in the background (Popen, not run): the
    @web_server decorator expects this function to return promptly while
    the server keeps listening on PORT.
    """
    # Pass an argv list with shell=False (the default) instead of an
    # interpolated shell string: no quoting pitfalls, no shell injection.
    cmd = [
        "text-generation-launcher",
        "--model-id", MODEL_ID,
        "--hostname", "0.0.0.0",
        "--port", str(PORT),
    ]
    subprocess.Popen(cmd)
# Once you receive your endpoint (e.g. https://xyz-modal-is-awesome.modal.run), you can consume it like this:
# curl https://xyz-modal-is-awesome.modal.run/generate \
# -X POST \
# -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
# -H 'Content-Type: application/json'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment