affableroots/db_plus_ti.py Secret

## db_plus_ti.py
'''

Extends v1.0 by adding custom placeholder tokens to the tokenizer vocabulary.


USAGE Example:


export MAX_TRAIN_STEPS=500
export NUM_CLASS_IMAGES=1500
export TRAIN_BATCH_SIZE=2
export PRIOR_LW=1.0
export GRAD_ACCUM_STEPS=2
export LEARNING_RATE=1e-5        # UNET
export ADAM_WEIGHT_DECAY=0.0     # UNET
export LEARNING_RATE_TE=1e-3     # TEXT EMBEDDINGS
export ADAM_WEIGHT_DECAY_TE=1e-2 # TEXT EMBEDDINGS

export MODEL_NAME=../stable-diffusion-v1-4
export INSTANCE_DIR=../the_imgs_to_train
export CLASS_DIR=../the_regularization_imgs
export OUTPUT_DIR=./my_new_concept
export INSTANCE_PROMPT="a photo of <myface0> <myface1> <myface2>" # Multiple tokens possible

accelerate launch examples/dreambooth/train_dreambooth_2.py \
  --class_prompt="a photo of a man's portrait" \
  --class_data_dir=$CLASS_DIR \
  --num_class_images="$NUM_CLASS_IMAGES" \
  --with_prior_preservation \
  --prior_loss_weight="$PRIOR_LW" \
  --pretrained_model_name_or_path=$MODEL_NAME \
  --instance_data_dir=$INSTANCE_DIR \
  --output_dir=$OUTPUT_DIR \
  --placeholder_tokens "<myface0>" "<myface1>" "<myface2>" \
  --initial_tokens "face" "male" "man" \
  --instance_prompt="$INSTANCE_PROMPT" \
  --resolution=512 \
  --train_batch_size=$TRAIN_BATCH_SIZE \
  --gradient_accumulation_steps=$GRAD_ACCUM_STEPS \
  --learning_rate="$LEARNING_RATE" \
  --learning_rate_te="$LEARNING_RATE_TE" \
  --lr_scheduler="constant_with_warmup" \
  --lr_warmup_steps=50 \
  --max_train_steps="$MAX_TRAIN_STEPS" \
  --gradient_checkpointing \
  --use_8bit_adam \
  --adam_weight_decay="$ADAM_WEIGHT_DECAY" \
  --adam_weight_decay_te="$ADAM_WEIGHT_DECAY_TE"

'''

import itertools
import argparse
import math
import os
from contextlib import nullcontext
from pathlib import Path
from typing import Optional

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch.utils.data import Dataset

from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
from diffusers.optimization import get_scheduler
from PIL import Image
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer
import numpy as np

# NOTE: from here on the name `logging` refers to transformers' logging
# utilities, not the standard-library `logging` module.
from transformers import logging
# Set transformers' own log verbosity to the error level.
logging.set_verbosity_error()

# Module-level logger created through accelerate's `get_logger`.
logger = get_logger(__name__)

def add_token(tokenizer, text_encoder, tokens, initial_tokens):
    '''Add new placeholder tokens and seed their embeddings from existing tokens.

    Each string in ``tokens`` is added to ``tokenizer``, the input embedding
    of ``text_encoder`` is resized to the new vocabulary size, and the
    embedding row of every token that received a new id is overwritten with
    the row of the matching entry in ``initial_tokens``. Tokens whose id did
    not change (already present, e.g. when re-training) are left untouched.

    Args:
        tokenizer: object providing ``encode``, ``add_tokens`` and ``__len__``.
            ``encode`` is assumed to return one id per entry when given the
            list ``tokens`` (TODO confirm for the tokenizer in use).
        text_encoder: object providing ``get_input_embeddings`` and
            ``resize_token_embeddings``.
        tokens: list of new token strings.
        initial_tokens: list of existing token strings, paired positionally
            with ``tokens``. Only the first id of each encoded initial token
            is used.

    Progress is reported with ``print``; nothing is returned.
    '''
    if len(initial_tokens) != len(tokens):
        # zip() below stops at the shorter list, so surplus placeholder tokens
        # would silently keep whatever values the resize gave them.
        print(
            f'WARNING: {len(tokens)} placeholder tokens but {len(initial_tokens)} '
            f'initial tokens; unmatched entries are not initialised.'
        )

    with torch.no_grad():
        # Log info
        len_before = len(tokenizer)
        shape_before = str(text_encoder.get_input_embeddings().weight.shape)
        print(f'ADDING TOKENS: {tokens}')
        print(f'Tokenizer Len Before: {len_before}')
        print(f'Embedding Before: {shape_before}')

        # ADD TOKENS
        old_ids = tokenizer.encode(tokens, add_special_tokens=False)
        num_added_toks = tokenizer.add_tokens(tokens)
        len_after = len(tokenizer)

        # RESIZE EMBEDDINGS
        text_encoder.resize_token_embeddings(len_after)
        # Fetch the embedding layer again: a reference taken before the resize
        # may point at the old, smaller layer and would log stale information.
        token_embedding = text_encoder.get_input_embeddings()
        new_ids = tokenizer.encode(tokens, add_special_tokens=False)

        # Log info
        print(f'Tokenizer Len After : {len_after}')
        print(f'Tokenizer Length: {len_before} --> {len_after}')
        print(f'Added {num_added_toks}')
        print(f'Old Ids: {old_ids}')
        print(f'New Ids: {new_ids}')
        print(f'Embedding After : {token_embedding} {token_embedding.weight.shape}')

        # SET EMBEDDINGS TO INITIALS
        print('Updating embedding to the initial_tokens')
        w = token_embedding.weight.data
        for old_id, tok_id, init_tok in zip(old_ids, new_ids, initial_tokens):
            if old_id == tok_id:  # it was already in there, don't overwrite (eg probably re-training)
                continue
            init_id = tokenizer.encode(init_tok, add_special_tokens=False)[0]
            init_w = w[init_id]
            # Copy the old row so the "before" log is not affected by the write.
            w_before = w[tok_id].detach().cpu().clone()
            w[tok_id, :] = init_w
            print(f'Token weight {init_tok} id={init_id}: {init_w[:5]}...')
            print(f'Token weight {tok_id} before update : {w_before[:5]}...')
            print(f'Token weight {tok_id} after update  : {w[tok_id][:5]}...')


def freeze_params(params):
    """Disable gradient tracking on every parameter yielded by ``params``.

    ``params`` may be any iterable (including a one-shot iterator); each item
    has its ``requires_grad`` attribute set to ``False`` in place.
    """
    for parameter in params:
        parameter.requires_grad = False


def parse_args():
    """Parse and validate the command-line options of the training script.

    Reads ``sys.argv`` through ``argparse``. If the ``LOCAL_RANK`` environment
    variable is set to something other than -1 it overrides ``--local_rank``.

    Returns:
        argparse.Namespace: the parsed options.

    Raises:
        ValueError: if an option that the script uses unconditionally
            (instance data dir, instance prompt, placeholder/initial tokens)
            is missing, if the two token lists differ in length, or if
            ``--with_prior_preservation`` is given without a class data
            directory or class prompt.
    """
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    parser.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        default=None,
        required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--tokenizer_name",
        type=str,
        default=None,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--instance_data_dir",
        type=str,
        default=None,
        required=True,
        help="A folder containing the training data of instance images.",
    )
    parser.add_argument(
        "--class_data_dir",
        type=str,
        default=None,
        required=False,
        help="A folder containing the training data of class images.",
    )
    parser.add_argument(
        "--instance_prompt",
        type=str,
        default=None,
        help="The prompt with identifier specifing the instance",
    )
    parser.add_argument(
        "--class_prompt",
        type=str,
        default=None,
        help="The prompt to specify images in the same class as provided intance images.",
    )

    ##################################################
    # Textual-inversion specific options

    parser.add_argument(
        "--placeholder_tokens",
        type=str,
        nargs='+',
        default=None,
        help="The new tokens to add to the vocabulary.",
    )
    parser.add_argument(
        "--initial_tokens",
        type=str,
        nargs='+',
        default=None,
        help="Copy the embedding weights from these extant tokens to the new placeholder tokens.",
    )

    parser.add_argument(
        "--learning_rate_te",
        type=float,
        default=5e-6,
        help="Learning rate for Text Embeddings (not on UNET).",
    )

    parser.add_argument("--adam_weight_decay_te", type=float, default=1e-2, help="Weight decay to use on text embeddings (not on UNET).")

    ##################################################

    parser.add_argument(
        "--with_prior_preservation",
        default=False,
        action="store_true",
        help="Flag to add prior perservation loss.",
    )
    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
    parser.add_argument(
        "--num_class_images",
        type=int,
        default=100,
        help=(
            "Minimal class images for prior perversation loss. If not have enough images, additional images will be"
            " sampled with class_prompt."
        ),
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="text-inversion-model",
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
    parser.add_argument(
        "--resolution",
        type=int,
        default=512,
        help=(
            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
            " resolution"
        ),
    )
    parser.add_argument(
        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
    )
    parser.add_argument(
        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
    )
    parser.add_argument(
        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
    )
    parser.add_argument("--num_train_epochs", type=int, default=1)
    parser.add_argument(
        "--max_train_steps",
        type=int,
        default=None,
        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--gradient_checkpointing",
        action="store_true",
        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=5e-6,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--scale_lr",
        action="store_true",
        default=False,
        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
    )
    parser.add_argument(
        "--lr_scheduler",
        type=str,
        default="constant",
        help=(
            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
            ' "constant", "constant_with_warmup"]'
        ),
    )
    parser.add_argument(
        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
    )
    parser.add_argument(
        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
    )
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
    parser.add_argument("--adam_weight_decay", type=float, default=0.0, help="Weight decay to use.")
    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--use_auth_token",
        action="store_true",
        help=(
            "Will use the token generated when running `huggingface-cli login` (necessary to use this script with"
            " private models)."
        ),
    )
    parser.add_argument(
        "--logging_dir",
        type=str,
        default="logs",
        help=(
            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
        ),
    )
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default="no",
        choices=["no", "fp16", "bf16"],
        help=(
            # Fragments previously ran together ("Choosebetween", "1.10.and").
            "Whether to use mixed precision. Choose"
            " between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10"
            " and an Nvidia Ampere GPU."
        ),
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")

    args = parser.parse_args()

    # A launcher-provided LOCAL_RANK takes precedence over the CLI flag.
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
    if env_local_rank != -1 and env_local_rank != args.local_rank:
        args.local_rank = env_local_rank

    _validate_args(args)
    return args


def _validate_args(args):
    """Raise ValueError for option combinations the training run cannot use."""
    if args.instance_data_dir is None:
        raise ValueError("You must specify a train data directory.")

    # These options default to None but are consumed unconditionally later on
    # (prompt tokenisation, add_token), so fail here instead of after the
    # models have been loaded and class images generated.
    if args.instance_prompt is None:
        raise ValueError("You must specify an instance prompt.")
    if not args.placeholder_tokens:
        raise ValueError("You must specify at least one placeholder token.")
    if not args.initial_tokens:
        raise ValueError("You must specify the initial tokens for the placeholder tokens.")
    if len(args.initial_tokens) != len(args.placeholder_tokens):
        raise ValueError(
            f"Got {len(args.placeholder_tokens)} placeholder tokens but {len(args.initial_tokens)} initial tokens;"
            " provide exactly one initial token per placeholder token."
        )

    if args.with_prior_preservation:
        if args.class_data_dir is None:
            raise ValueError("You must specify a data directory for class images.")
        if args.class_prompt is None:
            raise ValueError("You must specify prompt for class images.")


class DreamBoothDataset(Dataset):
    """
    Pairs instance images (and, optionally, class images) with their prompts.

    Every entry found directly inside the instance directory is treated as an
    image. When a class directory is supplied, its entries are served
    alongside the instance images and the dataset length becomes the larger of
    the two counts; the shorter list is cycled through with a modulo index.
    Images are transformed to tensors and prompts are tokenized (unpadded) on
    access.
    """

    def __init__(
        self,
        instance_data_root,
        instance_prompt,
        tokenizer,
        class_data_root=None,
        class_prompt=None,
        size=512,
        center_crop=False,
    ):
        self.size = size
        self.center_crop = center_crop
        self.tokenizer = tokenizer

        instance_root = Path(instance_data_root)
        self.instance_data_root = instance_root
        if not instance_root.exists():
            raise ValueError("Instance images root doesn't exists.")

        self.instance_images_path = list(instance_root.iterdir())
        self.num_instance_images = len(self.instance_images_path)
        self.instance_prompt = instance_prompt
        self._length = self.num_instance_images

        if class_data_root is None:
            self.class_data_root = None
        else:
            class_root = Path(class_data_root)
            self.class_data_root = class_root
            # Create the directory if needed so that listing it cannot fail.
            class_root.mkdir(parents=True, exist_ok=True)
            self.class_images_path = list(class_root.iterdir())
            self.num_class_images = len(self.class_images_path)
            self._length = max(self.num_class_images, self.num_instance_images)
            self.class_prompt = class_prompt

        if center_crop:
            crop = transforms.CenterCrop(size)
        else:
            crop = transforms.RandomCrop(size)
        self.image_transforms = transforms.Compose(
            [
                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
                crop,
                transforms.ToTensor(),
                transforms.Normalize([0.5], [0.5]),
            ]
        )

    def __len__(self):
        return self._length

    def _load_image(self, path):
        # Open the file, force RGB mode, and run the transform pipeline.
        image = Image.open(path)
        if image.mode != "RGB":
            image = image.convert("RGB")
        return self.image_transforms(image)

    def _tokenize(self, prompt):
        # Padding is deferred (the token ids are returned unpadded here).
        encoded = self.tokenizer(
            prompt,
            padding="do_not_pad",
            truncation=True,
            max_length=self.tokenizer.model_max_length,
        )
        return encoded.input_ids

    def __getitem__(self, index):
        instance_path = self.instance_images_path[index % self.num_instance_images]
        example = {
            "instance_images": self._load_image(instance_path),
            "instance_prompt_ids": self._tokenize(self.instance_prompt),
        }

        if self.class_data_root:
            class_path = self.class_images_path[index % self.num_class_images]
            example["class_images"] = self._load_image(class_path)
            example["class_prompt_ids"] = self._tokenize(self.class_prompt)

        return example


class PromptDataset(Dataset):
    """Serve the same prompt ``num_samples`` times, tagged with the item index.

    Used to drive class-image generation: each item is a dict holding the
    prompt text and the position it was requested at.
    """

    def __init__(self, prompt, num_samples):
        self.prompt, self.num_samples = prompt, num_samples

    def __len__(self):
        return self.num_samples

    def __getitem__(self, index):
        return {"prompt": self.prompt, "index": index}


def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
    """Build the ``<namespace>/<model_id>`` repository name.

    Args:
        model_id: repository name without namespace.
        organization: namespace to use. When given, no account lookup is made
            and ``token`` is ignored.
        token: credential passed to ``whoami`` when the namespace has to be
            looked up; when ``None`` it is obtained from ``HfFolder.get_token()``.

    Returns:
        ``"{organization}/{model_id}"`` when ``organization`` is given,
        otherwise ``"{username}/{model_id}"`` with the name reported by
        ``whoami``.
    """
    # Handle the organization case first: it needs neither a token nor a
    # lookup, so it must not depend on the helpers used below.
    if organization is not None:
        return f"{organization}/{model_id}"

    # NOTE(review): `HfFolder` and `whoami` are not imported anywhere in this
    # file as shown; they presumably come from `huggingface_hub` -- confirm and
    # add the import before relying on this path.
    if token is None:
        token = HfFolder.get_token()
    username = whoami(token)["name"]
    return f"{username}/{model_id}"


def _unwrap(model):
    """Return ``model.module`` when ``model`` is wrapped, else ``model`` itself."""
    return getattr(model, "module", model)


def _debug_weight_snapshot(text_encoder, vae, unet, token_ids):
    """Copy a handful of weights to NumPy so later changes can be detected.

    Returns a dict, in insertion order: ``emb1_w``..``embN_w`` (one token
    embedding row per id in ``token_ids``), ``ci_w``/``mb_w``/``co_w`` (first
    100 values of three VAE encoder weights) and ``u_w`` (first 100 values of
    the UNet input convolution).
    """
    token_weight = _unwrap(text_encoder).text_model.embeddings.token_embedding.weight
    with torch.no_grad():
        # Index the row before moving to CPU so the full embedding matrix is
        # not copied for every token; clone() so the array never aliases the
        # live parameter.
        snapshot = {
            f"emb{position}_w": token_weight[tok_id].detach().cpu().clone().numpy()
            for position, tok_id in enumerate(token_ids, start=1)
        }
        # VAE
        snapshot["ci_w"] = vae.encoder.conv_in.weight.detach().cpu().flatten()[0:100].clone().numpy()
        snapshot["mb_w"] = (
            vae.encoder.mid_block.attentions[0].proj_attn.weight.detach().cpu().flatten()[0:100].clone().numpy()
        )
        snapshot["co_w"] = vae.encoder.conv_out.weight.detach().cpu().flatten()[0:100].clone().numpy()
        # UNET
        snapshot["u_w"] = _unwrap(unet).conv_in.weight.detach().cpu().flatten()[0:100].clone().numpy()
    return snapshot


def main():
    """Run DreamBooth fine-tuning of the UNet together with textual inversion.

    Steps, in order:
      1. Parse arguments and create the ``Accelerator``.
      2. With prior preservation, generate class images until the class
         directory holds ``--num_class_images`` entries.
      3. Load tokenizer, text encoder, VAE and UNet; freeze the text encoder
         except its token embedding; add the placeholder tokens.
      4. Train: the UNet and the token embedding are optimised with separate
         learning rates / weight decay; only the placeholder rows of the
         embedding are allowed to change.
      5. On the main process, save a ``StableDiffusionPipeline`` with the
         trained UNet, text encoder and tokenizer to ``--output_dir``.
    """
    args = parse_args()
    logging_dir = Path(args.output_dir, args.logging_dir)

    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
        log_with="tensorboard",
        logging_dir=logging_dir,
    )

    if args.seed is not None:
        set_seed(args.seed)

    if args.with_prior_preservation:
        class_images_dir = Path(args.class_data_dir)
        if not class_images_dir.exists():
            class_images_dir.mkdir(parents=True)
        cur_class_images = len(list(class_images_dir.iterdir()))

        if cur_class_images < args.num_class_images:
            on_cuda = accelerator.device.type == "cuda"
            torch_dtype = torch.float16 if on_cuda else torch.float32
            pipeline = StableDiffusionPipeline.from_pretrained(
                args.pretrained_model_name_or_path, use_auth_token=args.use_auth_token, torch_dtype=torch_dtype
            )
            pipeline.set_progress_bar_config(disable=True)

            num_new_images = args.num_class_images - cur_class_images
            logger.info(f"Number of class images to sample: {num_new_images}.")

            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)

            sample_dataloader = accelerator.prepare(sample_dataloader)
            pipeline.to(accelerator.device)

            for example in tqdm(
                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
            ):
                # `nullcontext` must be instantiated: the class itself is not a
                # context manager, so the non-CUDA path used to fail here.
                context = torch.autocast("cuda") if on_cuda else nullcontext()
                with context:
                    images = pipeline(example["prompt"]).images

                for i, image in enumerate(images):
                    image.save(class_images_dir / f"{example['index'][i] + cur_class_images}.jpg")

            del pipeline
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    # Load the tokenizer
    if args.tokenizer_name:
        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
    elif args.pretrained_model_name_or_path:
        tokenizer = CLIPTokenizer.from_pretrained(
            args.pretrained_model_name_or_path, subfolder="tokenizer", use_auth_token=args.use_auth_token
        )

    # Load models and create wrapper for stable diffusion
    text_encoder = CLIPTextModel.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="text_encoder", use_auth_token=args.use_auth_token
    )
    vae = AutoencoderKL.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="vae", use_auth_token=args.use_auth_token
    )
    unet = UNet2DConditionModel.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="unet", use_auth_token=args.use_auth_token
    )

    ##################################################
    # Textual Inversion

    # Freeze everything but the input_embedding
    params_to_freeze = itertools.chain(
        text_encoder.text_model.encoder.parameters(),
        text_encoder.text_model.final_layer_norm.parameters(),
        text_encoder.text_model.embeddings.position_embedding.parameters(),
    )
    freeze_params(params_to_freeze)

    # Insert new token
    add_token(tokenizer, text_encoder, args.placeholder_tokens, args.initial_tokens)
    placeholder_token_ids = tokenizer.encode(args.placeholder_tokens, add_special_tokens=False)

    ##################################################
    # Debug
    #
    # Some tensors are saved outside the main loop so that IN the loop we can
    # check whether they are changing, especially when they shouldn't.
    # The probe strings are hard-coded; only the first id of each is tracked.
    debug_token_ids = [
        tokenizer(probe, add_special_tokens=False)['input_ids'][0]
        for probe in ("<myface>", "man", "myface")
    ]
    debug_baseline = _debug_weight_snapshot(text_encoder, vae, unet, debug_token_ids)

    ##################################################

    if args.gradient_checkpointing:
        unet.enable_gradient_checkpointing()

    if args.scale_lr:
        args.learning_rate = (
            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
        )

    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
    if args.use_8bit_adam:
        try:
            import bitsandbytes as bnb
        except ImportError:
            raise ImportError(
                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
            )

        optimizer_class = bnb.optim.AdamW8bit
    else:
        optimizer_class = torch.optim.AdamW

    # Two parameter groups so the UNet and the token embedding get their own
    # learning rate and weight decay.
    optimizer = optimizer_class(
        [
            {'params':unet.parameters(),
             'lr': args.learning_rate,
             'weight_decay': args.adam_weight_decay,
            },

            {'params':text_encoder.get_input_embeddings().parameters(),
             'lr': args.learning_rate_te,
             'weight_decay': args.adam_weight_decay_te},
        ],
        betas=(args.adam_beta1, args.adam_beta2),
        eps=args.adam_epsilon,
    )

    # NOTE: SGD can perform better, but learns slower.

    # optimizer = torch.optim.SGD(
    #     [
    #         {'params':unet.parameters(), 'lr' : args.learning_rate},
    #         {'params':text_encoder.get_input_embeddings().parameters(), 'lr' : args.learning_rate},
    #     ],
    #     lr=args.learning_rate,
    #     momentum=0.9
    # )

    noise_scheduler = DDPMScheduler(
        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
    )

    train_dataset = DreamBoothDataset(
        instance_data_root=args.instance_data_dir,
        instance_prompt=args.instance_prompt,
        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
        class_prompt=args.class_prompt,
        tokenizer=tokenizer,
        size=args.resolution,
        center_crop=args.center_crop,
    )

    def collate_fn(examples):
        input_ids = [example["instance_prompt_ids"] for example in examples]
        pixel_values = [example["instance_images"] for example in examples]

        # Concat class and instance examples for prior preservation.
        # We do this to avoid doing two forward passes.
        if args.with_prior_preservation:
            input_ids += [example["class_prompt_ids"] for example in examples]
            pixel_values += [example["class_images"] for example in examples]

        pixel_values = torch.stack(pixel_values)
        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()

        input_ids = tokenizer.pad({"input_ids": input_ids}, padding=True, return_tensors="pt").input_ids

        batch = {
            "input_ids": input_ids,
            "pixel_values": pixel_values,
        }
        return batch

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.train_batch_size, shuffle=True, collate_fn=collate_fn
    )

    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
        overrode_max_train_steps = True

    lr_scheduler = get_scheduler(
        args.lr_scheduler,
        optimizer=optimizer,
        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
    )

    unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        unet, text_encoder, optimizer, train_dataloader, lr_scheduler
    )

    # Move vae to gpu (text_encoder is handled by `accelerator.prepare` above)
    # text_encoder.to(accelerator.device)
    vae.to(accelerator.device)

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if overrode_max_train_steps:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    # Afterwards we recalculate our number of training epochs
    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    # We need to initialize the trackers we use, and also store our configuration.
    # The trackers initializes automatically on the main process.
    if accelerator.is_main_process:
        # `init_trackers` doesn't like lists, so drop the list-valued options.
        # Build a filtered copy: `vars(args)` is the namespace's own dict, and
        # deleting from it would remove the attributes from `args` as well.
        tracker_config = {
            key: value
            for key, value in vars(args).items()
            if key not in ("placeholder_tokens", "initial_tokens")
        }
        accelerator.init_trackers("dreambooth", config=tracker_config)

    # Train!
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    progress_bar.set_description("Steps")
    global_step = 0

    # The prepared text encoder may be wrapped (exposing the real model as
    # `.module`); always reach the embedding through the unwrapped model.
    embedding_layer = _unwrap(text_encoder).get_input_embeddings()

    # Reference copy of the embedding matrix, used to pin the rows that must
    # not be trained.
    token_embed_w_copy = embedding_layer.weight.detach().clone().to(accelerator.device)

    # Boolean row mask: True for every token EXCEPT the placeholder tokens.
    # It does not change during training, so build it once.
    index_grads_to_zero = torch.ones(len(tokenizer), dtype=bool)
    for pid in placeholder_token_ids:
        index_grads_to_zero[pid] = False

    for epoch in range(args.num_train_epochs):
        unet.train()
        text_encoder.train()
        for step, batch in enumerate(train_dataloader):
            # NOTE(review): `accumulate` receives an `itertools.chain` object
            # rather than the models themselves; whether gradient-sync
            # suppression applies to either model in multi-process runs
            # depends on the accelerate version -- confirm.
            with accelerator.accumulate(itertools.chain(unet, text_encoder)):

                ##################################################
                # DEBUG

                debug_current = _debug_weight_snapshot(text_encoder, vae, unet, debug_token_ids)
                for name, before in debug_baseline.items():
                    # ignore these changes
                    if name in ['u_w', 'emb1_w']:
                        continue

                    after = debug_current[name]
                    if not np.allclose(before, after):
                        print(f'CHANGED: {name}: \n{before[:10]} \n{after[:10]}')

                ##################################################

                # Convert images to latent space
                with torch.no_grad():
                    latents = vae.encode(batch["pixel_values"]).latent_dist.sample()
                    latents = latents * 0.18215

                # Sample noise that we'll add to the latents
                noise = torch.randn(latents.shape).to(latents.device)
                bsz = latents.shape[0]
                # Sample a random timestep for each image
                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
                timesteps = timesteps.long()

                # Add noise to the latents according to the noise magnitude at each timestep
                # (this is the forward diffusion process)
                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

                # Get the text embedding for conditioning
                # (gradients are kept: the token embedding is being trained)
                encoder_hidden_states = text_encoder(batch["input_ids"])[0]

                # Predict the noise residual
                noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

                if args.with_prior_preservation:
                    # Chunk the noise and noise_pred into two parts and compute the loss on each part separately.
                    noise_pred, noise_pred_prior = torch.chunk(noise_pred, 2, dim=0)
                    noise, noise_prior = torch.chunk(noise, 2, dim=0)

                    # Compute instance loss
                    loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()

                    # Compute prior loss
                    prior_loss = F.mse_loss(noise_pred_prior, noise_prior, reduction="none").mean([1, 2, 3]).mean()

                    # Add the prior loss to the instance loss.
                    loss = loss + args.prior_loss_weight * prior_loss
                else:
                    loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()

                accelerator.backward(loss)

                # * Textual inversion doesn't clip
                # * Fine-tuning ex only clips during sync:  # https://github.com/huggingface/diffusers/pull/356/files#diff-47856c771c9f57d4cd90fbc472346aeafc70fc5f9e599c41de1d09e4f655b5d1R626
                # * originally DB clipped all the time

                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)

                ##################################################
                # Textual Inversion

                # Zero out the gradients for all token embeddings except the newly added
                # embeddings for the concept, as we only want to optimize the concept embeddings
                grads = embedding_layer.weight.grad
                grads.data[index_grads_to_zero, :] = 0

                # # A hack to undo Adam weight decay, keeping TI from updating the
                # # entire embedding weights.
                # if accelerator.sync_gradients:
                #     with torch.no_grad():
                #         text_encoder.get_input_embeddings().weight[index_grads_to_zero, :] += (
                #             lr_scheduler.get_last_lr()[0] *
                #             args.adam_weight_decay *
                #             text_encoder.get_input_embeddings().weight[index_grads_to_zero, :]
                #         )

                ##################################################

                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

                # Zeroing the grads alone does not keep the other embeddings
                # fixed (weight decay still touches them), so put the
                # reference values back. This has to happen AFTER the
                # optimizer step: restoring before it left the frozen rows
                # modified for the next forward pass and in the saved model.
                if accelerator.sync_gradients:
                    with torch.no_grad():
                        embedding_layer.weight[index_grads_to_zero, :] = token_embed_w_copy[index_grads_to_zero, :]

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
                progress_bar.update(1)
                global_step += 1

            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
            progress_bar.set_postfix(**logs)
            accelerator.log(logs, step=global_step)

            if global_step >= args.max_train_steps:
                break

        accelerator.wait_for_everyone()

    # Create the pipeline using using the trained modules and save it.
    if accelerator.is_main_process:
        pipeline = StableDiffusionPipeline.from_pretrained(
            args.pretrained_model_name_or_path,
            unet=accelerator.unwrap_model(unet),
            text_encoder=accelerator.unwrap_model(text_encoder),
            # The tokenizer is not a model and was never wrapped; pass it as is.
            tokenizer=tokenizer,
            use_auth_token=args.use_auth_token,
        )
        pipeline.save_pretrained(args.output_dir)

    accelerator.end_training()


# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()