# jochemstoel/mounting_hf_model.py

## mounting_hf_model.py
import io
import time
import modal
import os
import torch
import diffusers

from pathlib import Path

# Hugging Face repository to fetch and the local directory the model files are kept in.
model_id = "runwayml/stable-diffusion-v1-5"
local_path = "/tmp/hf-model"
# Taken from the environment; None when HUGGINGFACE_TOKEN is not set.
hf_token = os.environ.get("HUGGINGFACE_TOKEN")

stub = modal.Stub("local-model-mount-test")

# Container image, built one layer at a time: conda packages first, then pip packages.
image = modal.Image.conda()
image = image.run_commands(
    [
        "conda install xformers -c xformers/label/dev",
        "conda install pytorch torchvision pytorch-cuda=11.7 -c pytorch -c nvidia",
    ]
)
image = image.run_commands(["pip install diffusers[torch] transformers ftfy accelerate"])
stub.image = image


def download_model():
    """Fetch the scheduler and the fp16 pipeline for ``model_id`` and save both under ``local_path``.

    Both downloads authenticate with ``hf_token`` and use ``local_path`` as the
    download cache directory as well as the ``save_pretrained`` destination.
    """
    # Options shared by the two from_pretrained calls.
    hub_options = {"use_auth_token": hf_token, "cache_dir": local_path}

    scheduler = diffusers.EulerAncestralDiscreteScheduler.from_pretrained(
        model_id, subfolder="scheduler", **hub_options
    )
    scheduler.save_pretrained(local_path)

    pipeline = diffusers.StableDiffusionPipeline.from_pretrained(
        model_id, revision="fp16", torch_dtype=torch.float16, **hub_options
    )
    pipeline.save_pretrained(local_path)

class StableDiffusion:
    """Stable Diffusion pipeline loaded from ``local_path`` that renders prompts to PNG bytes."""

    def __enter__(self):
        """Load the scheduler and pipeline from ``local_path``, move the pipeline to CUDA, keep it on ``self.pipe``.

        NOTE(review): presumably invoked by Modal when the container starts — confirm.
        """
        import torch
        import diffusers

        # Backend flags are set before the pipeline is constructed.
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.benchmark = True

        scheduler = diffusers.EulerAncestralDiscreteScheduler.from_pretrained(local_path, subfolder="scheduler")
        pipeline = diffusers.StableDiffusionPipeline.from_pretrained(local_path, scheduler=scheduler)
        self.pipe = pipeline.to("cuda")
        self.pipe.enable_xformers_memory_efficient_attention()

    @stub.function(
        gpu=modal.gpu.A100(),
        mounts=[modal.Mount(local_dir=local_path, remote_dir=local_path)],
    )
    def run_inference(self, prompt: str, steps: int = 20) -> bytes:
        """Generate one image for ``prompt`` and return it encoded as PNG.

        Args:
            prompt: Text prompt handed to the pipeline.
            steps: Value passed as ``num_inference_steps``.

        Returns:
            The PNG-encoded bytes of the first generated image.
        """
        import torch

        with torch.inference_mode():
            result = self.pipe(prompt, num_inference_steps=steps, guidance_scale=7.0)
        picture = result.images[0]

        # Serialize the image to PNG in memory rather than on disk.
        with io.BytesIO() as png_buffer:
            picture.save(png_buffer, format="PNG")
            return png_buffer.getvalue()

def run_inference(
    samples: int = 10,
    prompt: str = "An 1600s oil painting of the New York City skyline",
    output_dir: "str | Path" = "/tmp/stable-diffusion",
):
    """Generate ``samples`` images for ``prompt`` through the stub and write them as PNG files.

    Called with no arguments it behaves as before: 10 samples of the default
    prompt written to ``/tmp/stable-diffusion``.

    Args:
        samples: Number of images to generate; each one is a separate
            ``sd.run_inference.call`` invocation.
        prompt: Text prompt passed to ``StableDiffusion.run_inference``.
        output_dir: Directory that receives ``output_<i>.png``; created
            (including parents) when it does not exist.
    """
    out_dir = Path(output_dir)
    # exist_ok=True already makes this a no-op for an existing directory,
    # so a separate exists() probe is unnecessary.
    out_dir.mkdir(parents=True, exist_ok=True)

    with stub.run():
        sd = StableDiffusion()
        for i in range(samples):
            # perf_counter is monotonic, so the measured interval cannot be
            # skewed by wall-clock adjustments.
            started = time.perf_counter()
            image_bytes = sd.run_inference.call(prompt)
            elapsed = time.perf_counter() - started
            output_path = out_dir / f"output_{i}.png"
            print(f"Sample {i} took {elapsed:.3f}s. Saving it to {output_path}")
            output_path.write_bytes(image_bytes)

if __name__ == "__main__":
    # Save the model files locally first, then run the sampling loop against the stub.
    download_model()
    run_inference()
	import io
	import time
	import modal
	import os
	import torch
	import diffusers

	from pathlib import Path

	# Hugging Face repository to fetch and the local directory the model files are kept in.
	model_id = "runwayml/stable-diffusion-v1-5"
	local_path = "/tmp/hf-model"
	# Taken from the environment; None when HUGGINGFACE_TOKEN is not set.
	hf_token = os.environ.get("HUGGINGFACE_TOKEN")

	stub = modal.Stub("local-model-mount-test")

	# Container image, built one layer at a time: conda packages first, then pip packages.
	image = modal.Image.conda()
	image = image.run_commands(
		[
			"conda install xformers -c xformers/label/dev",
			"conda install pytorch torchvision pytorch-cuda=11.7 -c pytorch -c nvidia",
		]
	)
	image = image.run_commands(["pip install diffusers[torch] transformers ftfy accelerate"])
	stub.image = image



	def download_model():
		"""Fetch the scheduler and the fp16 pipeline for ``model_id`` and save both under ``local_path``.

		Both downloads authenticate with ``hf_token`` and use ``local_path`` as the
		download cache directory as well as the ``save_pretrained`` destination.
		"""
		# Options shared by the two from_pretrained calls.
		hub_options = {"use_auth_token": hf_token, "cache_dir": local_path}

		scheduler = diffusers.EulerAncestralDiscreteScheduler.from_pretrained(
			model_id, subfolder="scheduler", **hub_options
		)
		scheduler.save_pretrained(local_path)

		pipeline = diffusers.StableDiffusionPipeline.from_pretrained(
			model_id, revision="fp16", torch_dtype=torch.float16, **hub_options
		)
		pipeline.save_pretrained(local_path)

	class StableDiffusion:
		"""Stable Diffusion pipeline loaded from ``local_path`` that renders prompts to PNG bytes."""

		def __enter__(self):
			"""Load the scheduler and pipeline from ``local_path``, move the pipeline to CUDA, keep it on ``self.pipe``.

			NOTE(review): presumably invoked by Modal when the container starts — confirm.
			"""
			import torch
			import diffusers

			torch.backends.cudnn.benchmark = True
			torch.backends.cuda.matmul.allow_tf32 = True

			euler = diffusers.EulerAncestralDiscreteScheduler.from_pretrained(local_path, subfolder="scheduler")
			self.pipe = diffusers.StableDiffusionPipeline.from_pretrained(local_path, scheduler=euler).to("cuda")
			self.pipe.enable_xformers_memory_efficient_attention()

		@stub.function(gpu=modal.gpu.A100(), mounts=[
			modal.Mount(local_dir=local_path, remote_dir=local_path)
		])
		def run_inference(self, prompt: str, steps: int = 20) -> bytes:
			"""Generate one image for ``prompt`` and return it encoded as PNG.

			Args:
				prompt: Text prompt handed to the pipeline.
				steps: Value passed as ``num_inference_steps``.

			Returns:
				The PNG-encoded bytes of the first generated image.
			"""
			import torch

			with torch.inference_mode():
				image = self.pipe(prompt, num_inference_steps=steps, guidance_scale=7.0).images[0]

			# Serialize the image to PNG in memory rather than on disk.
			buf = io.BytesIO()
			image.save(buf, format="PNG")
			image_bytes = buf.getvalue()
			return image_bytes

	def run_inference(
		samples: int = 10,
		prompt: str = "An 1600s oil painting of the New York City skyline",
		output_dir: "str | Path" = "/tmp/stable-diffusion",
	):
		"""Generate ``samples`` images for ``prompt`` through the stub and write them as PNG files.

		Called with no arguments it uses 10 samples of the default prompt and
		writes to ``/tmp/stable-diffusion``.

		Args:
			samples: Number of images to generate; each one is a separate
				``sd.run_inference.call`` invocation.
			prompt: Text prompt passed to ``StableDiffusion.run_inference``.
			output_dir: Directory that receives ``output_<i>.png``; created
				(including parents) when it does not exist.
		"""
		out_dir = Path(output_dir)
		# exist_ok=True already makes this a no-op for an existing directory,
		# so a separate exists() probe is unnecessary.
		out_dir.mkdir(parents=True, exist_ok=True)

		with stub.run():
			sd = StableDiffusion()
			for i in range(samples):
				# perf_counter is monotonic, so the measured interval cannot be
				# skewed by wall-clock adjustments.
				started = time.perf_counter()
				image_bytes = sd.run_inference.call(prompt)
				elapsed = time.perf_counter() - started
				output_path = out_dir / f"output_{i}.png"
				print(f"Sample {i} took {elapsed:.3f}s. Saving it to {output_path}")
				output_path.write_bytes(image_bytes)


	if __name__ == "__main__":
		# Save the model files locally first, then run the sampling loop against the stub.
		download_model()
		run_inference()