# # Stable Diffusion on PyTorch 2.0 (Modal)
# ## Basic setup
from __future__ import annotations
import io
import os
import time
from pathlib import Path
import modal
# All Modal programs need a [`Stub`](/docs/reference/modal.Stub) — an object that acts as a recipe for
# the application. Let's give it a friendly name.
stub = modal.Stub("stable-diffusion-cli")
# We will be using `typer` to create our CLI interface.
import typer
app = typer.Typer()
# ## Model dependencies
#
# Your model will run remotely inside a container. In the next step we install
# all of the model dependencies and "bake the model" into the image by running
# a Python function as part of the image build. This lets containers start much
# faster, since all the data they need is already inside the image.
model_id = "runwayml/stable-diffusion-v1-5"
cache_path = "/vol/cache"
def download_models():
    import diffusers
    import torch

    hugging_face_token = os.environ["HUGGINGFACE_TOKEN"]

    # Download scheduler configuration. Experiment with different schedulers
    # to identify one that works best for your use-case.
    scheduler = diffusers.DPMSolverMultistepScheduler.from_pretrained(
        model_id,
        subfolder="scheduler",
        use_auth_token=hugging_face_token,
        cache_dir=cache_path,
    )
    scheduler.save_pretrained(cache_path, safe_serialization=True)

    # Downloads all other models.
    pipe = diffusers.StableDiffusionPipeline.from_pretrained(
        model_id,
        use_auth_token=hugging_face_token,
        revision="fp16",
        torch_dtype=torch.float16,
        cache_dir=cache_path,
    )
    pipe.save_pretrained(cache_path, safe_serialization=True)
image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        "accelerate",
        "diffusers[torch]>=0.10",
        "ftfy",
        "torchvision",
        "transformers",
        "triton",
        "safetensors",
    )
    .pip_install("torch==2.0.0")
    .run_function(
        download_models,
        secrets=[modal.Secret.from_name("huggingface-secret")],
    )
)
stub.image = image
# ## Using container lifecycle methods
#
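# A container will serve many inference requests over its lifetime, so we want
# to load the model once when the container starts instead of on every call.
# The `__enter__` method below runs when the container boots: it builds the
# scheduler, moves the pipeline to the GPU, and compiles the UNet with PyTorch
# 2.0, so that later `run_inference` calls can reuse all of that work.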
class StableDiffusion:
    def __enter__(self):
        import diffusers
        import torch

        torch.backends.cuda.matmul.allow_tf32 = True

        scheduler = diffusers.DPMSolverMultistepScheduler.from_pretrained(
            cache_path,
            subfolder="scheduler",
            solver_order=2,
            prediction_type="epsilon",
            thresholding=False,
            algorithm_type="dpmsolver++",
            solver_type="midpoint",
            denoise_final=True,  # important if steps are <= 10
        )
        self.pipe = diffusers.StableDiffusionPipeline.from_pretrained(
            cache_path, scheduler=scheduler
        ).to("cuda")

        # Compiling the UNet is a one-time cost per container; every
        # subsequent inference call reuses the compiled model.
        t0 = time.time()
        self.pipe.unet = torch.compile(self.pipe.unet)
        print(f"compiled unet in => {time.time() - t0:.3f}s")
    @stub.function(gpu="A100", concurrency_limit=1)
    def run_inference(
        self, prompt: str, steps: int = 20, batch_size: int = 4
    ) -> list[bytes]:
        import torch

        with torch.inference_mode():
            with torch.autocast("cuda"):
                images = self.pipe(
                    [prompt] * batch_size,
                    num_inference_steps=steps,
                    guidance_scale=7.0,
                ).images

        # Convert to PNG bytes
        image_output = []
        for image in images:
            with io.BytesIO() as buf:
                image.save(buf, format="PNG")
                image_output.append(buf.getvalue())
        return image_output
# This is the command we'll use to generate images. It takes a `prompt`,
# `samples` (the number of inference calls to run), `steps` (the number of
# inference steps the model takes per call), and `batch_size` (how many images
# to generate per call, so you get `samples * batch_size` images in total).
@stub.local_entrypoint
def entrypoint(
    prompt: str, samples: int = 5, steps: int = 10, batch_size: int = 1
):
    typer.echo(
        f"prompt => {prompt}, steps => {steps}, samples => {samples}, batch_size => {batch_size}"
    )

    dir = Path("/tmp/stable-diffusion")
    if not dir.exists():
        dir.mkdir(exist_ok=True, parents=True)

    sd = StableDiffusion()
    for i in range(samples):
        t0 = time.time()
        images = sd.run_inference.call(prompt, steps, batch_size)
        total_time = time.time() - t0
        print(
            f"Sample {i} took {total_time:.3f}s ({total_time / len(images):.3f}s / image)."
        )
        for j, image_bytes in enumerate(images):
            output_path = dir / f"output_{j}_{i}.png"
            print(f"Saving it to {output_path}")
            with open(output_path, "wb") as f:
                f.write(image_bytes)
# And this is our entrypoint, where the CLI is invoked. Explore the CLI options
# with `modal run stable_diffusion_cli.py --help`.
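#
# For example, a command along these lines should generate and save images under
# `/tmp/stable-diffusion` (the flag names are derived from the entrypoint
# arguments above; the prompt text is just an illustration):
#
#     modal run stable_diffusion_cli.py --prompt "an oil painting of a lighthouse" --samples 3 --steps 20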
# # Performance
#
# This example can generate pictures in about a second, with a startup time of
# about 10s for the first picture.
#
# See the distribution of latencies below. This data was gathered by running 500 requests
# in sequence (so only the first request incurs a cold start). The 90th percentile
# latency is 1.2s and the 99th percentile is 2.30s.
#
# ![latencies](./stable_diffusion_latencies.png)