Code for "Creating an infinite icon library" blog post
import os
import sys
from dataclasses import dataclass
from pathlib import Path

from fastapi import FastAPI
from modal import Image, App, Volume, gpu, Mount, Secret, enter, method, asgi_app
GIT_SHA = "abd922bd0c43a504e47eca2ed354c3634bd00834"  # specify the commit to fetch

image = (
    Image.debian_slim(python_version="3.10")
    .pip_install(
        "accelerate==0.27.2",
        "datasets~=2.19.1",
        "ftfy~=6.1.1",
        "gradio~=3.50.2",
        "smart_open~=6.4.0",
        "transformers~=4.38.1",
        "torch~=2.2.0",
        "torchvision~=0.16",
        "triton~=2.2.0",
        "peft==0.7.0",
        "wandb==0.16.3",
    )
    .apt_install("git")
    # Perform a shallow fetch of just the target `diffusers` commit, checking out
    # the commit in the container's current working directory, /root.
    .run_commands(
        "cd /root && git init .",
        "cd /root && git remote add origin https://github.com/huggingface/diffusers",
        f"cd /root && git fetch --depth=1 origin {GIT_SHA} && git checkout {GIT_SHA}",
        "cd /root && pip install -e .",
    )
)
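# Because the `diffusers` repo is checked out into /root (the container's
# working directory) and installed in editable mode, its `examples/` directory
# is importable from the training function below.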
# ## Set up `Volume`s for training data and model output
#
# Modal can't access your local filesystem, so you should set up a `Volume` to persist the model weights once training finishes.
web_app = FastAPI()

# training run on the full heroicons dataset, with captions that omit the HCON prefix
app = App(name="example-diffusers-app-05-15-2024-full-heroicons")
MODEL_DIR = Path("/model")
model_volume = Volume.from_name(
    "diffusers-model-volume-05-15-2024-full-heroicons", create_if_missing=True
)

VOLUME_CONFIG = {
    MODEL_DIR: model_volume,
}
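# Each key in `VOLUME_CONFIG` is the path where the volume is mounted inside
# the container; each value is the persistent `Volume` backing it.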
DATASET_NAME = [
    "yirenlu/heroicons-without-hcon",
    # "yirenlu/heroicons-subset-100-images",
]
RESOLUTIONS = [128]
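# The training grid explored in `run()` below is the Cartesian product of
# DATASET_NAME and RESOLUTIONS.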
# ## Set up config
#
# Each Diffusers example script takes a different set of hyperparameters, so you will need to customize the config depending on the hyperparameters of the script. The code below shows some example parameters.
@dataclass
class TrainConfig:
    """Configuration for the finetuning training."""

    # identifier for pretrained model on Hugging Face
    model_name: str = "runwayml/stable-diffusion-v1-5"
    # resume_from_checkpoint: str = "/model/yirenlu/heroicons_512/checkpoint-6000/"
    # Hugging Face Hub dataset
    dataset_name: str = "yirenlu/heroicons"
    # Hyperparameters/constants from some of the Diffusers examples.
    # Modify these to match the hyperparameters of the script you are using.
    mixed_precision: str = "fp16"  # precision of floats during training; fp16 and lower are mixed with fp32 under the hood
    resolution: int = 128  # images will be resized to this resolution
    max_train_steps: int = 5000  # number of gradient updates to apply during training
    checkpointing_steps: int = 1000  # number of steps between model checkpoints, for resuming training
    train_batch_size: int = 1  # how many images to process at once, limited by GPU VRAM
    gradient_accumulation_steps: int = 4  # how many batches to process before updating the model; stabilizes training with large effective batch sizes
    learning_rate: float = 1e-05  # scaling factor on gradient updates; make this proportional to batch size * accumulation steps
    lr_scheduler: str = "constant"  # dynamic schedule for changes to the base learning_rate
    lr_warmup_steps: int = 0  # for non-constant lr schedules, how many steps to spend increasing the learning_rate from a small initial value
    max_grad_norm: int = 1  # value above which to clip gradients; stabilizes training
    caption_column: str = "text"  # name of the dataset column that contains the image captions
    validation_prompt: str = "an icon of a dragon creature"
@dataclass
class AppConfig:
    """Configuration information for inference."""

    num_inference_steps: int = 50
    guidance_scale: float = 20
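    # note: 20 is well above the Stable Diffusion default guidance_scale of 7.5;
    # higher guidance follows the prompt more literally at some cost to diversity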
@app.function(
    image=image,
    gpu=gpu.A100(size="80GB"),  # finetuning is VRAM-hungry, so this should be an A100 or H100
    volumes=VOLUME_CONFIG,
    timeout=3600 * 5,  # multiple hours
    secrets=[Secret.from_name("huggingface-secret-ren")],
    _allow_background_volume_commits=True,
)
def train(hyperparameter_config):
    import huggingface_hub
    from accelerate import notebook_launcher
    from accelerate.utils import write_basic_config

    # change this line to import the training script you want to use
    from examples.text_to_image.train_text_to_image import main
    from transformers import CLIPTokenizer

    # set up TrainConfig
    config = TrainConfig()

    # set up the Hugging Face accelerate library for fast training
    write_basic_config(mixed_precision="fp16")

    # authenticate to Hugging Face so we can download the model weights
    hf_key = os.environ["HF_TOKEN"]
    huggingface_hub.login(hf_key)

    # check whether we can access the model repo
    try:
        CLIPTokenizer.from_pretrained(config.model_name, subfolder="tokenizer")
    except OSError as e:  # handle the error raised when the license has not been accepted
        license_error_msg = f"Unable to load tokenizer. Access to this model requires acceptance of the license on Hugging Face here: https://huggingface.co/{config.model_name}."
        raise Exception(license_error_msg) from e
    def launch_training():
        sys.argv = [
            "examples/text_to_image/train_text_to_image.py",  # script path; change if using a different example script
            f"--pretrained_model_name_or_path={config.model_name}",
            f"--dataset_name={hyperparameter_config['dataset_name']}",
            "--use_ema",
            f"--output_dir={hyperparameter_config['output_dir']}",
            f"--resolution={hyperparameter_config['resolution']}",
            "--center_crop",
            "--random_flip",
            f"--gradient_accumulation_steps={config.gradient_accumulation_steps}",
            "--gradient_checkpointing",
            f"--train_batch_size={config.train_batch_size}",
            f"--learning_rate={config.learning_rate}",
            f"--lr_scheduler={config.lr_scheduler}",
            f"--max_train_steps={config.max_train_steps}",
            f"--lr_warmup_steps={config.lr_warmup_steps}",
            f"--checkpointing_steps={config.checkpointing_steps}",
            # f"--resume_from_checkpoint={hyperparameter_config['checkpoint_dir']}",
        ]
        main()

    # run training -- see the Hugging Face accelerate docs for details
    print("launching fine-tuning training script")
    notebook_launcher(launch_training, num_processes=1)
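    # `notebook_launcher` invokes the script's `main()` in-process, rather than
    # shelling out to `accelerate launch`.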
    # The trained model artefacts were written to the volume mounted at
    # `MODEL_DIR`; commit so they persist beyond this container.
    model_volume.commit()
@app.local_entrypoint()
def run():
    # build the full grid of hyperparameter combinations to train over
    hyperparameter_search = [
        {
            "dataset_name": dataset_name,
            "resolution": resolution,
            "output_dir": MODEL_DIR / f"{dataset_name}_{resolution}",
            "checkpoint_dir": MODEL_DIR / f"{dataset_name}_{resolution}/checkpoint-6000/",
        }
        for dataset_name in DATASET_NAME
        for resolution in RESOLUTIONS
    ]
    # fan out one containerized training run per configuration
    for x in train.map(hyperparameter_search):
        print(x)
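# Kick off training from your machine with the Modal CLI:
#   modal run <this_file>.py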
@app.cls(
    image=image,
    gpu="A10G",  # inference requires less VRAM than training, so we can use a cheaper GPU
    volumes=VOLUME_CONFIG,  # mount the location where your model weights were saved
)
class Model:
    @enter()
    def load_model(self):
        import torch
        from diffusers import StableDiffusionPipeline, UNet2DConditionModel

        # Reload the modal.Volume to ensure the latest state is accessible.
        model_volume.reload()

        # Load the finetuned UNet from a mid-training checkpoint, then drop it
        # into the base Stable Diffusion pipeline.
        unet = UNet2DConditionModel.from_pretrained(
            MODEL_DIR / "yirenlu/heroicons-without-hcon_128/checkpoint-3000/unet",
            torch_dtype=torch.float16,
        )
        pipe = StableDiffusionPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", unet=unet, torch_dtype=torch.float16
        )
        pipe.to("cuda")
        # pipe.enable_xformers_memory_efficient_attention()

        # Alternatively, load the fully trained pipeline from the volume:
        # pipe = StableDiffusionPipeline.from_pretrained(
        #     MODEL_DIR / "yirenlu/heroicons-without-hcon_128", torch_dtype=torch.float16
        # )
        # pipe.to("cuda")
        # pipe.enable_xformers_memory_efficient_attention()

        self.pipe = pipe
    @method()
    def inference(self, text, config):
        image = self.pipe(
            text,
            num_inference_steps=config.num_inference_steps,
            guidance_scale=config.guidance_scale,
        ).images[0]
        return image
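# A minimal sketch of calling the finetuned model from another local
# entrypoint (`generate` is a hypothetical example, not part of this gist):
#
#   @app.local_entrypoint()
#   def generate(prompt: str = "an icon of a rocket ship"):
#       image = Model().inference.remote(prompt, AppConfig())
#       image.save("icon.png")  # the pipeline returns a PIL.Image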
assets_path = Path(__file__).parent / "assets"


@app.function(
    image=image,
    concurrency_limit=3,
    # mount the local ./assets directory into the container so the custom CSS
    # referenced below is available at /assets
    mounts=[Mount.from_local_dir(assets_path, remote_path="/assets")],
)
@asgi_app()
def fastapi_app():
    import gradio as gr
    from gradio.routes import mount_gradio_app

    # Call the GPU inference function on Modal.
    def go(text):
        return Model().inference.remote(text, config)

    # set up AppConfig
    config = AppConfig()

    HCON_prefix = "an icon of"

    example_prompts = [
        f"{HCON_prefix} a movie ticket",
        f"{HCON_prefix} Barack Obama",
        f"{HCON_prefix} a castle",
        f"{HCON_prefix} a German Shepherd",
    ]

    modal_docs_url = "https://modal.com/docs/guide"
    modal_example_url = f"{modal_docs_url}/examples/train_and_serve_diffusers_script"

    description = """Describe a concept that you would like drawn as a [Heroicon](https://heroicons.com/). Try the examples below for inspiration."""

    # add a Gradio UI around inference
    interface = gr.Interface(
        fn=go,
        inputs="text",
        outputs=gr.Image(shape=(512, 512)),
        title="Generate custom heroicons",
        examples=example_prompts,
        description=description,
        css="/assets/index.css",
        allow_flagging="never",
    )

    # mount for execution on Modal
    return mount_gradio_app(
        app=web_app,
        blocks=interface,
        path="/",
    )
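# Serve the Gradio UI at a temporary URL with `modal serve <this_file>.py`,
# or deploy it with `modal deploy <this_file>.py`.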