@agyaatcoder
Created April 24, 2024 19:11
LLM inference on Modal Labs through the vLLM engine. This exposes an endpoint that is compatible with the OpenAI Python client.

import os
import subprocess

from modal import Image, Secret, Stub, gpu, web_server

MODEL_DIR = "/model"
BASE_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
GPU_CONFIG = gpu.A100(memory=80, count=2)


def download_model_to_folder():
    from huggingface_hub import snapshot_download
    from transformers.utils import move_cache

    os.makedirs(MODEL_DIR, exist_ok=True)

    snapshot_download(
        BASE_MODEL,
        local_dir=MODEL_DIR,
        ignore_patterns=["*.pt", "*.bin"],  # Using safetensors
    )
    move_cache()

# ### Image definition
# We’ll start from a recommended Docker Hub image and install `vLLM`.
# Then we’ll use `run_function` to run the function defined above to ensure the weights of
# the model are saved within the container image.
image = (
    Image.from_registry(
        "nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10"
    )
    .pip_install(
        "vllm==0.2.5",
        "huggingface_hub==0.19.4",
        "hf-transfer==0.1.4",
        "torch==2.1.2",
    )
    # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_function(
        download_model_to_folder,
        secrets=[Secret.from_name("huggingface")],
        timeout=60 * 20,
    )
)
stub = Stub("multi-gpu-inference", image=image)

@stub.function(allow_concurrent_inputs=100, gpu=GPU_CONFIG, container_idle_timeout=300)
@web_server(8000, startup_timeout=600)
def my_file_server():
    if GPU_CONFIG.count > 1:
        # Patch issue from https://github.com/vllm-project/vllm/issues/1116
        import ray

        ray.shutdown()
        ray.init(num_gpus=GPU_CONFIG.count)

    # Launch vLLM's OpenAI-compatible API server in the background;
    # @web_server waits until port 8000 starts accepting connections.
    subprocess.Popen(
        "python -m vllm.entrypoints.openai.api_server"
        " --model mistralai/Mixtral-8x7B-Instruct-v0.1"
        " --tensor-parallel-size 2 --host 0.0.0.0 --port 8000",
        shell=True,
    )
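
Once deployed (for example with `modal deploy`), Modal prints the endpoint URL, and the server can be queried with the standard OpenAI Python client. Below is a minimal sketch, assuming an openai>=1.0 client; the base_url is a hypothetical placeholder for whatever URL Modal assigns your deployment, and the API key is a dummy value since the server as configured here does not enforce one.

from openai import OpenAI

# The base_url is a placeholder -- substitute the URL that `modal deploy`
# prints for this app. Note the trailing /v1.
client = OpenAI(
    base_url="https://your-workspace--multi-gpu-inference-my-file-server.modal.run/v1",
    api_key="not-needed",  # this server does not check API keys; any string works
)

response = client.chat.completions.create(
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",
    messages=[{"role": "user", "content": "Explain tensor parallelism in one sentence."}],
    max_tokens=128,
)
print(response.choices[0].message.content)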