@luiscape
Last active January 4, 2023 04:20
StableDiffusion NGC Containers on Modal
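A Modal app that runs Stable Diffusion v1.5 inference on A10G GPUs, using an NVIDIA NGC PyTorch container as the base image.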
import io
import os
import time
from pathlib import Path

import modal
import typer

stub = modal.Stub("stable-diffusion-cli")
app = typer.Typer()

model_id = "runwayml/stable-diffusion-v1-5"
cache_path = "/vol/cache"
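# Downloads the scheduler config and pipeline weights into `cache_path`. This
# runs once at image-build time via `run_function` below, so containers start
# with the weights already baked into the image.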
def download_models():
    import diffusers
    import torch

    hugging_face_token = os.environ["HUGGINGFACE_TOKEN"]

    # Download scheduler configuration. Experiment with different schedulers
    # to identify one that works best for your use-case.
    scheduler = diffusers.DPMSolverMultistepScheduler.from_pretrained(
        model_id, subfolder="scheduler", use_auth_token=hugging_face_token, cache_dir=cache_path
    )
    scheduler.save_pretrained(cache_path, safe_serialization=True)

    # Downloads all other models.
    pipe = diffusers.StableDiffusionPipeline.from_pretrained(
        model_id, use_auth_token=hugging_face_token, revision="fp16", torch_dtype=torch.float16, cache_dir=cache_path
    )
    pipe.save_pretrained(cache_path, safe_serialization=True)
image = (
    # Use the PyTorch image from NVIDIA NGC: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags
    # NVIDIA NGC images perform better in benchmarks [1]. This is related to an issue with dynamic or static
    # linking of NVIDIA deep learning libraries used by PyTorch. [2]
    # [1] https://gist.github.com/rwightman/bb59f9e245162cee0e38bd66bd8cd77f
    # [2] https://github.com/pytorch/pytorch/issues/50153#issuecomment-808854369
    modal.Image.from_dockerhub("nvcr.io/nvidia/pytorch:22.12-py3")
    .run_commands(
        # Install Python 3.9, point /usr/bin/python at it, and reinstall pip
        # for the new interpreter.
        "apt-get update",
        "DEBIAN_FRONTEND=noninteractive apt-get install software-properties-common -y",
        "add-apt-repository ppa:deadsnakes/ppa",
        "apt update",
        "apt install python3.9 python3.9-distutils -y",
        "rm /usr/bin/python && ln -s /usr/bin/python3.9 /usr/bin/python",
        "python -V",
        "curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py",
        "python get-pip.py",
    )
    .pip_install(
        "accelerate",
        "diffusers[torch]>=0.10",
        "ftfy",
        "torch",
        "torchvision",
        "transformers",
        "triton",
        "safetensors",
        "xformers==0.0.16rc393",
        "tensorboard==2.11.0",
        "modal-client",
    )
    .run_function(
        download_models,
        secrets=[modal.Secret.from_name("huggingface-secret")],
    )
)
stub.image = image
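# Container lifecycle: `__enter__` runs once per container, loading the cached
# scheduler and pipeline and moving them to the GPU, so repeated `run_inference`
# calls in a warm container skip model loading entirely.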
class StableDiffusion:
    def __enter__(self):
        start = time.time()

        import diffusers
        import torch

        print(f"imports => {time.time() - start:.3f}s")

        torch.backends.cuda.matmul.allow_tf32 = True

        time_load = time.time()
        scheduler = diffusers.DPMSolverMultistepScheduler.from_pretrained(
            cache_path,
            subfolder="scheduler",
            solver_order=2,
            prediction_type="epsilon",
            thresholding=False,
            algorithm_type="dpmsolver++",
            solver_type="midpoint",
            denoise_final=True,  # important if steps are <= 10
        )
        self.pipe = diffusers.StableDiffusionPipeline.from_pretrained(cache_path, scheduler=scheduler)
        print(f"load pipe => {time.time() - time_load:.3f}s")

        time_to_cuda = time.time()
        self.pipe.to("cuda")
        self.pipe.enable_xformers_memory_efficient_attention()
        print(f"to cuda => {time.time() - time_to_cuda:.3f}s")
        print(f"init total => {time.time() - start:.3f}s")
    @stub.function(gpu="A10G")
    def run_inference(self, prompt: str, steps: int = 20, batch_size: int = 4):
        import torch

        start = time.time()
        with torch.inference_mode():
            with torch.autocast("cuda"):
                images = self.pipe([prompt] * batch_size, num_inference_steps=steps, guidance_scale=7.0).images
        print(f"inference => {time.time() - start:.3f}s")

        # Convert to PNG bytes
        image_output = []
        for image in images:
            with io.BytesIO() as buf:
                image.save(buf, format="PNG")
                image_output.append(buf.getvalue())
        return image_output
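# Local CLI entrypoint: starts the Modal app, invokes `run_inference` remotely
# for each sample, and writes the returned PNGs to /tmp/stable-diffusion.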
@app.command()
def entrypoint(prompt: str, samples: int = 5, steps: int = 10, batch_size: int = 1):
    typer.echo(f"prompt => {prompt}, steps => {steps}, samples => {samples}, batch_size => {batch_size}")

    dir = Path("/tmp/stable-diffusion")
    if not dir.exists():
        dir.mkdir(exist_ok=True, parents=True)

    with stub.run():
        sd = StableDiffusion()
        for i in range(samples):
            t0 = time.time()
            images = sd.run_inference.call(prompt, steps, batch_size)
            total_time = time.time() - t0
            print(f"Sample {i} took {total_time:.3f}s ({total_time / len(images):.3f}s / image).")
            for j, image_bytes in enumerate(images):
                output_path = dir / f"output_{j}_{i}.png"
                print(f"Saving it to {output_path}")
                with open(output_path, "wb") as f:
                    f.write(image_bytes)
if __name__ == "__main__":
    app()
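For reference, a minimal way to try this out locally might look like the following (assuming the file is saved as stable_diffusion_cli.py, a placeholder name, and that a Modal secret named huggingface-secret already exists in your account):

python stable_diffusion_cli.py "an astronaut riding a horse on the moon" --samples 2 --steps 10

Since the Typer app registers a single command, its arguments are exposed directly on the script rather than behind a subcommand.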
luiscape commented Jan 4, 2023

After caches have been populated on GPU nodes, you can cold-boot the model and get the first generation in a total of ~12s:

imports => 2.372s
load pipe => 0.798s
to cuda => 3.544s
init total => 6.714s
100%|██████████| 10/10 [00:01<00:00,  7.56it/s]
inference => 2.189s
Sample 0 took 11.349s (11.349s / image).
Saving it to /tmp/stable-diffusion/output_0_0.png
100%|██████████| 10/10 [00:00<00:00, 15.63it/s]
inference => 0.786s
Sample 1 took 1.007s (1.007s / image).
Saving it to /tmp/stable-diffusion/output_0_1.png
