
@flexchar
Created September 20, 2023 23:50
Run BLOOM Chat 176B Model on Serverless
import time
import uuid
from modal import Image, method, Stub, web_endpoint, gpu
# This example showcases running the BLOOM Chat 176B 4-bit quant on 5 serverless GPUs.
# It is possible to run the original BLOOM model as well.
# Big kudos to TheBloke for creating the quants.
# https://huggingface.co/TheBloke/bloomz-176B-GPTQ
# https://huggingface.co/TheBloke/BLOOMChat-176B-v1-GPTQ
stub = Stub(name="bloom-chat-v1-q4") # change this to your own name
IMAGE_MODEL_DIR = "/model"
MODEL_BASE_FILE = "gptq_model-4bit--1g" # the model file name without the ".safetensors" suffix
# BLOOM models were split to adhere to the Hugging Face 50GB file limit, so after downloading we merge the shards back into a single file. This command works with both model versions.
SPLIT_FILE_REGEX = "gptq_model-4bit--1g.JOINBEFOREUSE.split-*.safetensors"
command = f"cd {IMAGE_MODEL_DIR} && cat {SPLIT_FILE_REGEX} > {MODEL_BASE_FILE}.safetensors && rm {SPLIT_FILE_REGEX}"
# Here we declare a function that downloads the model at image build time.
def download_model():
    import transformers
    from huggingface_hub import snapshot_download

    MODEL_NAME = "TheBloke/BLOOMChat-176B-v1-GPTQ"

    # Verify that at least 200GB is available.
    # This ensures the model can be downloaded and merged:
    # the split shards are only removed after the merged file has been written.
    import shutil

    total, used, free = shutil.disk_usage("/")
    assert free > 200 * 1024 * 1024 * 1024, f"Expected at least 200GB free space. Got {free}"

    # The download may fail once in a while. Simply rerun the script.
    print("Downloading model... expect 3-15 minutes...")
    start_time = time.time()
    snapshot_download(
        MODEL_NAME,
        local_dir=IMAGE_MODEL_DIR,
        resume_download=True,
        # The token is optional, but it speeds up the download.
        token="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",
    )
    end_time = time.time()
    print(f"Download completed, took => {end_time - start_time:.2f}s")

    print("Combining model files... expect 3 minutes...")
    import subprocess

    subprocess.run(command, check=True, shell=True)
    print(f"Model files combined, took => {time.time() - end_time:.2f}s")
    end_time = time.time()

    # Move the Transformers cache now so it does not happen at inference time.
    print("Moving cache... expect 2-4 minutes...")
    transformers.utils.move_cache()
    print(f"Cache moved, took => {time.time() - end_time:.2f}s")

    print("Done! Modal may take up to 15 minutes to upload the snapshot...")
inference_image = (
    Image.from_dockerhub(
        "nvidia/cuda:11.8.0-devel-ubuntu22.04",
        setup_dockerfile_commands=[
            "RUN apt-get update",
            "RUN apt-get install -y python3 python3-pip python-is-python3",
        ],
    )
    .apt_install("git", "gcc", "build-essential")
    .run_commands(
        "pip install --compile huggingface_hub transformers torch einops hf_transfer",
    )
    .env({
        "HF_HUB_ENABLE_HF_TRANSFER": "1",  # enable fast downloads; this mitigates common Hugging Face read timeouts
        "PIP_NO_CACHE_DIR": "1",
        "PIP_DISABLE_PIP_VERSION_CHECK": "1",
        "SAFETENSORS_FAST_GPU": "1",  # load the model directly to GPU memory, skipping RAM
        "BITSANDBYTES_NOWELCOME": "1",
    })
    .run_function(download_model)
    .run_commands(
        # Installing the Torch extension directly through pip fails to compile,
        # so we clone AutoGPTQ and build it from source while providing a T4 GPU.
        "git clone https://github.com/PanQiWei/AutoGPTQ.git",
        "cd AutoGPTQ && pip install --compile .",
        gpu="T4",
    )
)
api_image = Image.debian_slim()
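# Note: Modal calls __enter__ once when a container starts, so the quantized weights
# are loaded only on cold start and spread across the A100s via device_map="auto";
# subsequent generate() calls on a warm container reuse the already-loaded model.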
@stub.cls(image=inference_image, gpu=gpu.A100(count=5), container_idle_timeout=300, cloud="oci", concurrency_limit=1)
class TheModel:
    def __enter__(self):
        start_import = time.time()
        import torch
        from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList
        from auto_gptq import AutoGPTQForCausalLM

        print(f"importing libraries took => {time.time() - start_import:.2f}s")

        start_load_tokenizer = time.time()
        self.tokenizer = AutoTokenizer.from_pretrained(IMAGE_MODEL_DIR, use_fast=True)
        print(f"loading tokenizer took => {time.time() - start_load_tokenizer:.2f}s")

        start_loading_model = time.time()
        print("loading model...")
        self.model = AutoGPTQForCausalLM.from_quantized(
            IMAGE_MODEL_DIR,
            model_basename=MODEL_BASE_FILE,
            use_safetensors=True,
            device_map="auto",
            use_triton=False,
            strict=True,
        )
        self.model.tie_weights()
        print(f"Model loaded in => {time.time() - start_loading_model:.2f}s")

        cold_boot_time = time.time() - start_import
        print(f"total cold boot time => {cold_boot_time:.2f}s")
        self.is_loaded = False
        self.cold_boot_time = cold_boot_time
    @method()
    async def generate(self, input, temperature=0.7, max_tokens=256, stop_words=[""]):
        import torch
        from transformers import StoppingCriteria, StoppingCriteriaList

        # Report the cold boot time once; it is zero if the model was already loaded.
        if not self.is_loaded:
            cold_boot_time = self.cold_boot_time
            self.is_loaded = True
        else:
            cold_boot_time = 0

        stop_token_ids = self.tokenizer.convert_tokens_to_ids(stop_words)

        class StopOnTokens(StoppingCriteria):
            def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
                for stop_id in stop_token_ids:
                    if input_ids[0][-1] == stop_id:
                        return True
                return False

        stopping_criteria = StoppingCriteriaList([StopOnTokens()])

        t3 = time.time()
        input_ids = self.tokenizer(input, return_tensors="pt").input_ids.cuda()
        input_tokens = len(input_ids[0])

        generation = self.model.generate(
            inputs=input_ids,
            temperature=temperature,
            do_sample=temperature > 0,
            max_new_tokens=max_tokens,
            repetition_penalty=1.1,
            stopping_criteria=stopping_criteria if len(stop_words) > 0 else None,
        )
        completion_tokens = len(generation[0]) - input_tokens

        # Return the completion without the prompt:
        # slice the input tokens off the front of the generated sequence.
        new_tokens = generation[0][input_tokens:]
        completion = self.tokenizer.decode(new_tokens, skip_special_tokens=True)

        latency = time.time() - t3
        print(f"Input tokens: {input_tokens}")
        print(f"Completion tokens: {completion_tokens}")
        print(f"Generation took => {latency:.2f}s")

        return {
            "id": str(uuid.uuid4()),
            "completion": completion,
            "completion_tokens": completion_tokens,
            "prompt_tokens": input_tokens,
            "execution_time": latency,
            "delay_time": cold_boot_time,
            "model": stub.name,
        }
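# BLOOMChat was fine-tuned on "<human>:" / "<bot>:" turn markers, so prompts should
# follow that chat format, as in the demo prompt below.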
DEMO_INPUT = """
<human>: What is Modal?
<bot>: Modal (modal.com) lets you run code in the cloud without having to think about infrastructure.
Features
- Run any code remotely within seconds.
- Define container environments in code (or use one of our pre-built backends).
- Scale up horizontally to thousands of containers.
- Deploy and monitor persistent cron jobs.
- Attach GPUs with a single line of code.
- Serve your functions as web endpoints.
- Use powerful primitives like distributed dictionaries and queues.
- Run your code on a schedule.
<human>: What is the future of Modal?
<bot>:
"""
@stub.local_entrypoint()
def main():
    t0 = time.time()
    model = TheModel()
    val = model.generate.call(DEMO_INPUT)
    print(val)
    print(f"Total time: {time.time() - t0:.2f}s")
from pydantic import BaseModel
from typing_extensions import Annotated
from typing import List, Union
class CompletionRequest(BaseModel):
    prompt: Annotated[str, "The prompt for text completion"]
    temperature: Annotated[
        float,
        "Adjusts randomness of outputs; greater than 1 is more random and 0 is deterministic.",
    ] = 0.7
    max_tokens: Annotated[
        int, "Maximum number of new tokens to generate for text completion."
    ] = 16
    stop_words: Annotated[Union[str, List[str]], "Any additional stop words."] = []
    ref: Annotated[str, "Reference string for the completion"] = ""
@stub.function(image=api_image, cloud="oci", concurrency_limit=1)
@web_endpoint(method="POST")
def api(request: CompletionRequest):
    t = time.time()
    print(f"Request received: {request.ref}")
    result = TheModel().generate.call(
        input=request.prompt,
        temperature=request.temperature,
        max_tokens=request.max_tokens,
        stop_words=request.stop_words,
    )
    result["ref"] = request.ref
    print(f"Request completed: {request.ref} => {time.time() - t:.2f}s")
    return result
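# Example request once deployed (the URL below is a placeholder; Modal prints the
# real endpoint URL after `modal deploy`):
#   curl -X POST https://<your-workspace>--bloom-chat-v1-q4-api.modal.run \
#        -H 'Content-Type: application/json' \
#        -d '{"prompt": "<human>: Hello!\n<bot>:", "max_tokens": 64, "ref": "test-1"}'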