@penut85420
Created October 1, 2023 00:37
An example launch script for Hugging Face Text Generation Inference (TGI).
import os
import subprocess as sp


def launch():
    container_name = "HelloTGI"
    local_model = True  # Whether the model has already been downloaded locally
    download_dir = "Models"  # Where to download the model if it is not local
    model_path = "Models/Llama-2-7b-chat-fp16"
    image_name = "ghcr.io/huggingface/text-generation-inference:latest"
    quantize = "bitsandbytes-nf4"  # other options: gptq, bitsandbytes
    quantize = None  # set to None to disable quantization
    port = 8080
    gpu_device = 0
    batch_size = 8  # number of concurrent inference requests
    max_input_length = 1500  # maximum input length
    max_new_tokens = 500  # maximum output length
    max_total_tokens = max_input_length + max_new_tokens
    max_batch_prefill_tokens = max_input_length * batch_size

    cwd_dir = os.getcwd()
    if local_model:
        host_model_dir = os.path.join(cwd_dir, model_path)
        guest_model_dir = os.path.join("/", model_path)
    else:
        host_model_dir = os.path.join(cwd_dir, download_dir)
        guest_model_dir = "/data"

    max_best_of = 1
    num_shard = 1
    max_concurrent_requests = 128
    cuda_memory_fraction = 1.0

    # Docker Arguments
    cmds = list()
    cmds += ["docker", "run", "--rm", "--shm-size", "1g"]
    cmds += ["-p", f"{port}:80", "--name", container_name]
    cmds += ["--gpus", f"device={gpu_device}"]
    cmds += ["-v", f"{host_model_dir}:{guest_model_dir}"]
    cmds += [image_name]

    # TGI Arguments
    if local_model:
        cmds += ["--model-id", guest_model_dir]
    else:
        cmds += ["--model-id", model_path]
    cmds += ["--num-shard", num_shard]
    cmds += ["--quantize", quantize] if quantize else []
    cmds += ["--max-input-length", max_input_length]
    cmds += ["--max-total-tokens", max_total_tokens]
    cmds += ["--max-batch-prefill-tokens", max_batch_prefill_tokens]
    cmds += ["--max-concurrent-requests", max_concurrent_requests]
    cmds += ["--max-best-of", max_best_of]
    cmds += ["--cuda-memory-fraction", cuda_memory_fraction]
    cmds += ["--trust-remote-code"]
    cmds += ["--hostname", "0.0.0.0"]

    # Docker expects every argument as a string
    cmds = [str(cmd) for cmd in cmds]

    # Start Popen before the try block so `proc` is always bound
    # when a KeyboardInterrupt is caught
    proc = sp.Popen(cmds)
    try:
        proc.wait()
        exit(proc.returncode)
    except KeyboardInterrupt:
        proc.kill()


if __name__ == "__main__":
    launch()
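
Once the container is running, the server can be queried over HTTP. Below is a minimal client sketch, assuming the container launched above is reachable on localhost at the port configured in the script (8080), and using TGI's standard /generate endpoint; the requests library is a third-party dependency not used by the script itself.

import requests

# Query the TGI server started by the launch script above.
# /generate accepts an "inputs" prompt plus optional generation parameters
# and returns a JSON object containing "generated_text".
response = requests.post(
    "http://localhost:8080/generate",
    json={
        "inputs": "What is Deep Learning?",
        "parameters": {"max_new_tokens": 64},
    },
)
response.raise_for_status()
print(response.json()["generated_text"])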