An example launch script for Hugging Face Text Generation Inference (TGI): it wraps `docker run` and passes the most common TGI server arguments.
import os
import subprocess as sp


def launch():
    container_name = "HelloTGI"

    local_model = True  # whether the model is already downloaded locally
    download_dir = "Models"  # where to download the model if it is not local
    model_path = "Models/Llama-2-7b-chat-fp16"
    image_name = "ghcr.io/huggingface/text-generation-inference:latest"

    quantize = "bitsandbytes-nf4"  # other options: gptq, bitsandbytes
    quantize = None  # overrides the line above; None disables quantization

    port = 8080
    gpu_device = 0

    batch_size = 8  # number of concurrent inferences
    max_input_length = 1500  # maximum input length
    max_new_tokens = 500  # maximum output length
    max_total_tokens = max_input_length + max_new_tokens
    max_batch_prefill_tokens = max_input_length * batch_size

    cwd_dir = os.getcwd()
    if local_model:
        host_model_dir = os.path.join(cwd_dir, model_path)
        guest_model_dir = os.path.join("/", model_path)
    else:
        host_model_dir = os.path.join(cwd_dir, download_dir)
        guest_model_dir = "/data"

    max_best_of = 1
    num_shard = 1
    max_concurrent_requests = 128
    cuda_memory_fraction = 1.0

    # Docker Arguments
    cmds = list()
    cmds += ["docker", "run", "--rm", "--shm-size", "1g"]
    cmds += ["-p", f"{port}:80", "--name", container_name]
    cmds += ["--gpus", f"device={gpu_device}"]
    cmds += ["-v", f"{host_model_dir}:{guest_model_dir}"]
    cmds += [image_name]

    # TGI Arguments
    if local_model:
        cmds += ["--model-id", guest_model_dir]
    else:
        cmds += ["--model-id", model_path]
    cmds += ["--num-shard", num_shard]
    cmds += ["--quantize", quantize] if quantize else []
    cmds += ["--max-input-length", max_input_length]
    cmds += ["--max-total-tokens", max_total_tokens]
    cmds += ["--max-batch-prefill-tokens", max_batch_prefill_tokens]
    cmds += ["--max-concurrent-requests", max_concurrent_requests]
    cmds += ["--max-best-of", max_best_of]
    cmds += ["--cuda-memory-fraction", cuda_memory_fraction]
    cmds += ["--trust-remote-code"]
    cmds += ["--hostname", "0.0.0.0"]

    # stringify all arguments before handing them to the shell
    cmds = [str(c) for c in cmds]
    proc = sp.Popen(cmds)
    try:
        proc.wait()
        exit(proc.returncode)
    except KeyboardInterrupt:
        proc.kill()


if __name__ == "__main__":
    launch()
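
Once the container is up, the server can be queried over HTTP. Below is a minimal sketch against TGI's /generate endpoint, assuming the script above was run with its default port 8080; the prompt text and generation parameters are placeholders.

import requests

resp = requests.post(
    "http://localhost:8080/generate",
    json={
        "inputs": "What is deep learning?",  # placeholder prompt
        "parameters": {"max_new_tokens": 100},
    },
    timeout=60,
)
resp.raise_for_status()
# TGI returns the completion under the "generated_text" key
print(resp.json()["generated_text"])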