Skip to content

Instantly share code, notes, and snippets.

@affableroots
Created October 7, 2022 03:53
Show Gist options
  • Save affableroots/a36a74287c8eb2da438a459795b158d6 to your computer and use it in GitHub Desktop.
Save affableroots/a36a74287c8eb2da438a459795b158d6 to your computer and use it in GitHub Desktop.
'''
Extends the v1.0 DreamBooth training script by adding custom tokens to the tokenizer vocabulary.
USAGE Example:
export MAX_TRAIN_STEPS=500
export NUM_CLASS_IMAGES=1500
export TRAIN_BATCH_SIZE=2
export PRIOR_LW=1.0
export GRAD_ACCUM_STEPS=2
export LEARNING_RATE=1e-5 # UNET
export ADAM_WEIGHT_DECAY=0.0 # UNET
export LEARNING_RATE_TE=1e-3 # TEXT EMBEDDINGS
export ADAM_WEIGHT_DECAY_TE=1e-2 # TEXT EMBEDDINGS
export MODEL_NAME=../stable-diffusion-v1-4
export INSTANCE_DIR=../the_imgs_to_train
export CLASS_DIR=../the_regularization_imgs
export OUTPUT_DIR=./my_new_concept
export INSTANCE_PROMPT="a photo of <myface0> <myface1> <myface2>" # Multiple tokens possible
accelerate launch examples/dreambooth/train_dreambooth_2.py \
--class_prompt="a photo of a man's portrait" \
--class_data_dir=$CLASS_DIR \
--num_class_images="$NUM_CLASS_IMAGES" \
--with_prior_preservation \
--prior_loss_weight="$PRIOR_LW" \
--pretrained_model_name_or_path=$MODEL_NAME \
--instance_data_dir=$INSTANCE_DIR \
--output_dir=$OUTPUT_DIR \
--placeholder_tokens "<myface0>" "<myface1>" "<myface2>" \
--initial_tokens "face" "male" "man" \
--instance_prompt="$INSTANCE_PROMPT" \
--resolution=512 \
--train_batch_size=$TRAIN_BATCH_SIZE \
--gradient_accumulation_steps=$GRAD_ACCUM_STEPS \
--learning_rate="$LEARNING_RATE" \
--learning_rate_te="$LEARNING_RATE_TE" \
--lr_scheduler="constant_with_warmup" \
--lr_warmup_steps=50 \
--max_train_steps="$MAX_TRAIN_STEPS" \
--gradient_checkpointing \
--use_8bit_adam \
--adam_weight_decay="$ADAM_WEIGHT_DECAY" \
--adam_weight_decay_te="$ADAM_WEIGHT_DECAY_TE"
'''
import itertools
import argparse
import math
import os
from contextlib import nullcontext
from pathlib import Path
from typing import Optional
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch.utils.data import Dataset
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
from diffusers.optimization import get_scheduler
from PIL import Image
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer
import numpy as np
from transformers import logging
logging.set_verbosity_error()
logger = get_logger(__name__)
def add_token(tokenizer, text_encoder, tokens, initial_tokens):
    '''Add new token strings to the tokenizer and seed their embeddings.

    The tokenizer is extended with ``tokens``, the text encoder's input
    embedding matrix is resized to the new vocabulary size, and the embedding
    row of each newly added token is overwritten with the row of the matching
    entry in ``initial_tokens``. Progress is reported with ``print``.

    Args:
        tokenizer: object providing ``__len__``, ``encode`` and ``add_tokens``.
        text_encoder: object providing ``get_input_embeddings`` and
            ``resize_token_embeddings``.
        tokens: list of placeholder strings to add.
        initial_tokens: list of existing token strings, paired positionally
            with ``tokens``. ``None`` or an empty list leaves the new rows as
            ``resize_token_embeddings`` initialised them.

    Returns:
        None. ``tokenizer`` and ``text_encoder`` are modified in place.
    '''
    initial_tokens = initial_tokens or []
    with torch.no_grad():
        # Log info
        len_before = len(tokenizer)
        shape_before = str(text_encoder.get_input_embeddings().weight.shape)
        print(f'ADDING TOKENS: {tokens}')
        print(f'Tokenizer Len Before: {len_before}')
        print(f'Embedding Before: {shape_before}')

        # ADD TOKENS
        # The ids are looked up before and after the addition; a placeholder
        # whose id does not change was already in the vocabulary.
        # NOTE(review): this relies on `encode` mapping a list of strings to
        # one id per string -- confirm for the tokenizer class in use.
        old_ids = tokenizer.encode(tokens, add_special_tokens=False)
        num_added_toks = tokenizer.add_tokens(tokens)
        len_after = len(tokenizer)

        # RESIZE EMBEDDINGS
        text_encoder.resize_token_embeddings(len_after)
        # Re-fetch the embedding module: resizing may replace it, so a
        # reference taken before the resize can be stale.
        token_embedding = text_encoder.get_input_embeddings()
        emb_after = str(token_embedding)
        shape_after = str(token_embedding.weight.shape)
        new_ids = tokenizer.encode(tokens, add_special_tokens=False)

        # Log info
        print(f'Tokenizer Len After : {len_after}')
        print(f'Tokenizer Length: {len_before} --> {len_after}')
        print(f'Added {num_added_toks}')
        print(f'Old Ids: {old_ids}')
        print(f'New Ids: {new_ids}')
        print(f'Embedding After : {emb_after} {shape_after}')

        # SET EMBEDDINGS TO INITIALS
        print('Updating embedding to the initial_tokens')
        if len(initial_tokens) != len(new_ids):
            # zip() below stops at the shortest input; make that visible.
            print(
                f'WARNING: {len(new_ids)} placeholder tokens but {len(initial_tokens)} initial tokens; '
                'unpaired placeholder tokens keep their freshly initialised embeddings'
            )
        w = token_embedding.weight.data
        for old_id, tok_id, init_tok in zip(old_ids, new_ids, initial_tokens):
            if old_id == tok_id:  # it was already in there, don't overwrite (eg probably re-training)
                continue
            # Only the first id produced for the initial token is used.
            init_id = tokenizer.encode(init_tok, add_special_tokens=False)[0]
            init_w = w[init_id]
            tok_w_before = w[tok_id].detach().cpu().clone()
            w[tok_id, :] = init_w
            tok_w_after = w[tok_id]
            print(f'Token weight {init_tok} id={init_id}: {init_w[:5]}...')
            print(f'Token weight {tok_id} before update : {tok_w_before[:5]}...')
            print(f'Token weight {tok_id} after update : {tok_w_after[:5]}...')
def freeze_params(params):
    """Set ``requires_grad = False`` on every item of the iterable ``params``."""
    for tensor in params:
        tensor.requires_grad = False
def parse_args(input_args=None):
    """Build the command-line parser, parse the arguments and validate them.

    Args:
        input_args: optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv``.

    Returns:
        The parsed ``argparse.Namespace``. ``local_rank`` is overridden by the
        ``LOCAL_RANK`` environment variable when that variable is set and
        differs from the command-line value.

    Raises:
        ValueError: if ``--with_prior_preservation`` is given without
            ``--class_data_dir`` or without ``--class_prompt``.
    """
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    parser.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        default=None,
        required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--tokenizer_name",
        type=str,
        default=None,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--instance_data_dir",
        type=str,
        default=None,
        required=True,
        help="A folder containing the training data of instance images.",
    )
    parser.add_argument(
        "--class_data_dir",
        type=str,
        default=None,
        required=False,
        help="A folder containing the training data of class images.",
    )
    parser.add_argument(
        "--instance_prompt",
        type=str,
        default=None,
        help="The prompt with identifier specifying the instance",
    )
    parser.add_argument(
        "--class_prompt",
        type=str,
        default=None,
        help="The prompt to specify images in the same class as provided instance images.",
    )
    ##################################################
    # Textual-inversion specific options
    parser.add_argument(
        "--placeholder_tokens",
        type=str,
        nargs='+',
        default=None,
        help="The new tokens to add to the vocabulary.",
    )
    parser.add_argument(
        "--initial_tokens",
        type=str,
        nargs='+',
        default=None,
        help="Copy the embedding weights from these extant tokens to the new placeholder tokens.",
    )
    parser.add_argument(
        "--learning_rate_te",
        type=float,
        default=5e-6,
        help="Learning rate for Text Embeddings (not on UNET).",
    )
    parser.add_argument(
        "--adam_weight_decay_te",
        type=float,
        default=1e-2,
        help="Weight decay to use on text embeddings (not on UNET).",
    )
    ##################################################
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
        action="store_true",
        help="Flag to add prior preservation loss.",
    )
    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
    parser.add_argument(
        "--num_class_images",
        type=int,
        default=100,
        help=(
            "Minimal class images for prior preservation loss. If not have enough images, additional images will be"
            " sampled with class_prompt."
        ),
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="text-inversion-model",
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
    parser.add_argument(
        "--resolution",
        type=int,
        default=512,
        help=(
            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
            " resolution"
        ),
    )
    parser.add_argument(
        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
    )
    parser.add_argument(
        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
    )
    parser.add_argument(
        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
    )
    parser.add_argument("--num_train_epochs", type=int, default=1)
    parser.add_argument(
        "--max_train_steps",
        type=int,
        default=None,
        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--gradient_checkpointing",
        action="store_true",
        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=5e-6,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--scale_lr",
        action="store_true",
        default=False,
        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
    )
    parser.add_argument(
        "--lr_scheduler",
        type=str,
        default="constant",
        help=(
            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
            ' "constant", "constant_with_warmup"]'
        ),
    )
    parser.add_argument(
        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
    )
    parser.add_argument(
        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
    )
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
    parser.add_argument("--adam_weight_decay", type=float, default=0.0, help="Weight decay to use.")
    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--use_auth_token",
        action="store_true",
        help=(
            "Will use the token generated when running `huggingface-cli login` (necessary to use this script with"
            " private models)."
        ),
    )
    parser.add_argument(
        "--logging_dir",
        type=str,
        default="logs",
        help=(
            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
        ),
    )
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default="no",
        choices=["no", "fp16", "bf16"],
        help=(
            "Whether to use mixed precision. Choose"
            " between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10"
            " and an Nvidia Ampere GPU."
        ),
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")

    args = parser.parse_args(input_args)

    # The LOCAL_RANK environment variable takes precedence over --local_rank.
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
    if env_local_rank != -1 and env_local_rank != args.local_rank:
        args.local_rank = env_local_rank

    if args.instance_data_dir is None:
        raise ValueError("You must specify a train data directory.")

    if args.with_prior_preservation:
        if args.class_data_dir is None:
            raise ValueError("You must specify a data directory for class images.")
        if args.class_prompt is None:
            raise ValueError("You must specify prompt for class images.")

    return args
class DreamBoothDataset(Dataset):
    """
    Dataset yielding instance images (and, optionally, class images) together with their tokenized prompts.
    Images are converted to RGB and run through a resize / crop / to-tensor / normalize pipeline.
    """

    def __init__(
        self,
        instance_data_root,
        instance_prompt,
        tokenizer,
        class_data_root=None,
        class_prompt=None,
        size=512,
        center_crop=False,
    ):
        self.size = size
        self.center_crop = center_crop
        self.tokenizer = tokenizer

        self.instance_data_root = Path(instance_data_root)
        if not self.instance_data_root.exists():
            raise ValueError("Instance images root doesn't exists.")

        # Every directory entry is treated as an image path.
        self.instance_images_path = list(self.instance_data_root.iterdir())
        self.num_instance_images = len(self.instance_images_path)
        self.instance_prompt = instance_prompt
        self._length = self.num_instance_images

        if class_data_root is None:
            self.class_data_root = None
        else:
            self.class_data_root = Path(class_data_root)
            self.class_data_root.mkdir(parents=True, exist_ok=True)
            self.class_images_path = list(self.class_data_root.iterdir())
            self.num_class_images = len(self.class_images_path)
            # The dataset is as long as the larger image set; the smaller
            # one is cycled through with a modulo in __getitem__.
            self._length = max(self.num_class_images, self.num_instance_images)
            self.class_prompt = class_prompt

        crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
        self.image_transforms = transforms.Compose(
            [
                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
                crop,
                transforms.ToTensor(),
                transforms.Normalize([0.5], [0.5]),
            ]
        )

    def __len__(self):
        return self._length

    def _load_image(self, path):
        # Open the image at `path`, force RGB mode and apply the transform pipeline.
        image = Image.open(path)
        if image.mode != "RGB":
            image = image.convert("RGB")
        return self.image_transforms(image)

    def _tokenize(self, prompt):
        # Unpadded, truncated token ids for `prompt`.
        encoded = self.tokenizer(
            prompt,
            padding="do_not_pad",
            truncation=True,
            max_length=self.tokenizer.model_max_length,
        )
        return encoded.input_ids

    def __getitem__(self, index):
        example = {}
        instance_path = self.instance_images_path[index % self.num_instance_images]
        example["instance_images"] = self._load_image(instance_path)
        example["instance_prompt_ids"] = self._tokenize(self.instance_prompt)

        if self.class_data_root:
            class_path = self.class_images_path[index % self.num_class_images]
            example["class_images"] = self._load_image(class_path)
            example["class_prompt_ids"] = self._tokenize(self.class_prompt)

        return example
class PromptDataset(Dataset):
    "Dataset that repeats one prompt `num_samples` times, tagging each item with its index."

    def __init__(self, prompt, num_samples):
        self.prompt = prompt
        self.num_samples = num_samples

    def __len__(self):
        return self.num_samples

    def __getitem__(self, index):
        return {"prompt": self.prompt, "index": index}
def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
    """Return ``"<owner>/<model_id>"``.

    The owner is ``organization`` when given; otherwise it is the ``"name"``
    entry of ``whoami(token)``. A missing ``token`` is first replaced by
    ``HfFolder.get_token()``.
    """
    # NOTE(review): `HfFolder` and `whoami` are not imported in the visible
    # part of this file -- confirm they are in scope before calling this
    # without a token or without an organization.
    if token is None:
        token = HfFolder.get_token()
    if organization is not None:
        return f"{organization}/{model_id}"
    username = whoami(token)["name"]
    return f"{username}/{model_id}"
def main():
    """Run DreamBooth fine-tuning of the UNet together with textual inversion.

    Steps, in order:
      1. Parse arguments and create the ``Accelerator``.
      2. With prior preservation enabled, generate class images until
         ``args.num_class_images`` files exist in ``args.class_data_dir``.
      3. Load tokenizer, text encoder, VAE and UNet; freeze the text encoder
         except its token embedding; add the placeholder tokens.
      4. Train the UNet and the token embedding with separate learning rates
         and weight decays, keeping every embedding row other than the
         placeholder rows fixed.
      5. Save the resulting pipeline to ``args.output_dir``.
    """
    args = parse_args()
    logging_dir = Path(args.output_dir, args.logging_dir)

    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
        log_with="tensorboard",
        logging_dir=logging_dir,
    )

    if args.seed is not None:
        set_seed(args.seed)

    if args.with_prior_preservation:
        class_images_dir = Path(args.class_data_dir)
        if not class_images_dir.exists():
            class_images_dir.mkdir(parents=True)
        cur_class_images = len(list(class_images_dir.iterdir()))

        if cur_class_images < args.num_class_images:
            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
            pipeline = StableDiffusionPipeline.from_pretrained(
                args.pretrained_model_name_or_path, use_auth_token=args.use_auth_token, torch_dtype=torch_dtype
            )
            pipeline.set_progress_bar_config(disable=True)

            num_new_images = args.num_class_images - cur_class_images
            logger.info(f"Number of class images to sample: {num_new_images}.")

            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)

            sample_dataloader = accelerator.prepare(sample_dataloader)
            pipeline.to(accelerator.device)

            # `nullcontext` must be instantiated: the bare class cannot be
            # used in a `with` statement.
            context = torch.autocast("cuda") if accelerator.device.type == "cuda" else nullcontext()
            for example in tqdm(
                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
            ):
                with context:
                    images = pipeline(example["prompt"]).images

                for i, image in enumerate(images):
                    image.save(class_images_dir / f"{example['index'][i] + cur_class_images}.jpg")

            del pipeline
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    # Load the tokenizer
    if args.tokenizer_name:
        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
    elif args.pretrained_model_name_or_path:
        tokenizer = CLIPTokenizer.from_pretrained(
            args.pretrained_model_name_or_path, subfolder="tokenizer", use_auth_token=args.use_auth_token
        )

    # Load models and create wrapper for stable diffusion
    text_encoder = CLIPTextModel.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="text_encoder", use_auth_token=args.use_auth_token
    )
    vae = AutoencoderKL.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="vae", use_auth_token=args.use_auth_token
    )
    unet = UNet2DConditionModel.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="unet", use_auth_token=args.use_auth_token
    )

    ##################################################
    # Textual Inversion

    # Freeze everything but the input_embedding
    params_to_freeze = itertools.chain(
        text_encoder.text_model.encoder.parameters(),
        text_encoder.text_model.final_layer_norm.parameters(),
        text_encoder.text_model.embeddings.position_embedding.parameters(),
    )
    freeze_params(params_to_freeze)

    # Insert new token
    add_token(tokenizer, text_encoder, args.placeholder_tokens, args.initial_tokens)
    placeholder_token_ids = tokenizer.encode(args.placeholder_tokens, add_special_tokens=False)
    ##################################################

    ##################################################
    # Debug
    #
    # I save some tensors outside the main loop so that IN the loop, I can check
    # if they're changing, especially when I think they shouldn't.
    tok1_id = tokenizer("<myface>", add_special_tokens=False)['input_ids'][0]
    emb1_w = text_encoder.text_model.embeddings.token_embedding.weight.detach().cpu().data[tok1_id].clone().numpy()
    tok2_id = tokenizer("man", add_special_tokens=False)['input_ids'][0]
    emb2_w = text_encoder.text_model.embeddings.token_embedding.weight.detach().cpu().data[tok2_id].clone().numpy()
    tok3_id = tokenizer("myface", add_special_tokens=False)['input_ids'][0]
    emb3_w = text_encoder.text_model.embeddings.token_embedding.weight.detach().cpu().data[tok3_id].clone().numpy()

    # VAE
    ci_w = vae.encoder.conv_in.weight.detach().cpu().flatten()[0:100].clone().numpy()
    mb_w = vae.encoder.mid_block.attentions[0].proj_attn.weight.detach().cpu().flatten()[0:100].clone().numpy()
    co_w = vae.encoder.conv_out.weight.detach().cpu().flatten()[0:100].clone().numpy()

    # UNET
    u_w = unet.conv_in.weight.detach().cpu().flatten()[0:100].clone().numpy()
    ##################################################

    if args.gradient_checkpointing:
        unet.enable_gradient_checkpointing()

    if args.scale_lr:
        args.learning_rate = (
            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
        )

    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
    if args.use_8bit_adam:
        try:
            import bitsandbytes as bnb
        except ImportError:
            raise ImportError(
                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
            )

        optimizer_class = bnb.optim.AdamW8bit
    else:
        optimizer_class = torch.optim.AdamW

    # Two parameter groups so the UNet and the token embedding get their own
    # learning rate and weight decay.
    optimizer = optimizer_class(
        [
            {
                'params': unet.parameters(),
                'lr': args.learning_rate,
                'weight_decay': args.adam_weight_decay,
            },
            {
                'params': text_encoder.get_input_embeddings().parameters(),
                'lr': args.learning_rate_te,
                'weight_decay': args.adam_weight_decay_te,
            },
        ],
        betas=(args.adam_beta1, args.adam_beta2),
        eps=args.adam_epsilon,
    )

    # NOTE: SGD can perform better, but learns slower.
    # optimizer = torch.optim.SGD(
    #     [
    #         {'params':unet.parameters(), 'lr' : args.learning_rate},
    #         {'params':text_encoder.get_input_embeddings().parameters(), 'lr' : args.learning_rate},
    #     ],
    #     lr=args.learning_rate,
    #     momentum=0.9
    # )

    noise_scheduler = DDPMScheduler(
        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
    )

    train_dataset = DreamBoothDataset(
        instance_data_root=args.instance_data_dir,
        instance_prompt=args.instance_prompt,
        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
        class_prompt=args.class_prompt,
        tokenizer=tokenizer,
        size=args.resolution,
        center_crop=args.center_crop,
    )

    def collate_fn(examples):
        input_ids = [example["instance_prompt_ids"] for example in examples]
        pixel_values = [example["instance_images"] for example in examples]

        # Concat class and instance examples for prior preservation.
        # We do this to avoid doing two forward passes.
        if args.with_prior_preservation:
            input_ids += [example["class_prompt_ids"] for example in examples]
            pixel_values += [example["class_images"] for example in examples]

        pixel_values = torch.stack(pixel_values)
        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()

        input_ids = tokenizer.pad({"input_ids": input_ids}, padding=True, return_tensors="pt").input_ids

        batch = {
            "input_ids": input_ids,
            "pixel_values": pixel_values,
        }
        return batch

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.train_batch_size, shuffle=True, collate_fn=collate_fn
    )

    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
        overrode_max_train_steps = True

    lr_scheduler = get_scheduler(
        args.lr_scheduler,
        optimizer=optimizer,
        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
    )

    unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        unet, text_encoder, optimizer, train_dataloader, lr_scheduler
    )

    # Move text_encode and vae to gpu
    # text_encoder.to(accelerator.device)
    vae.to(accelerator.device)

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if overrode_max_train_steps:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    # Afterwards we recalculate our number of training epochs
    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    # We need to initialize the trackers we use, and also store our configuration.
    # The trackers initializes automatically on the main process.
    if accelerator.is_main_process:
        # `init_trackers` doesn't like lists, so the list-valued options are
        # left out. A filtered copy is used because `vars(args)` is the
        # namespace's own dict: deleting from it would remove the attributes
        # from `args` as well.
        tracker_config = {
            key: value
            for key, value in vars(args).items()
            if key not in ('placeholder_tokens', 'initial_tokens')
        }
        accelerator.init_trackers("dreambooth", config=tracker_config)

    # Train!
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f" Num examples = {len(train_dataset)}")
    logger.info(f" Num batches each epoch = {len(train_dataloader)}")
    logger.info(f" Num Epochs = {args.num_train_epochs}")
    logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
    logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f" Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    progress_bar.set_description("Steps")
    global_step = 0

    # Snapshot of the embedding matrix, used to restore every row that must
    # not be trained.
    # NOTE(review): `text_encoder.get_input_embeddings()` and
    # `text_encoder.text_model` are accessed directly here and in the loop,
    # while the gradient lookup below goes through `.module` when
    # num_processes > 1 -- confirm multi-process behaviour.
    token_embed_w_copy = text_encoder.get_input_embeddings().weight.data.clone().detach().requires_grad_(False).to(accelerator.device)

    # Mask of embedding rows that must stay fixed (everything except the
    # placeholder tokens). It does not change during training, so it is
    # built once here.
    index_grads_to_zero = torch.ones(len(tokenizer), dtype=bool)
    for pid in placeholder_token_ids:
        index_grads_to_zero[pid] = False

    for epoch in range(args.num_train_epochs):
        unet.train()
        text_encoder.train()
        for step, batch in enumerate(train_dataloader):
            # NOTE(review): `itertools.chain(unet, text_encoder)` is passed
            # where `accumulate` is given a model -- confirm this behaves as
            # intended in multi-process runs.
            with accelerator.accumulate(itertools.chain(unet, text_encoder)):

                ##################################################
                # DEBUG
                with torch.no_grad():
                    emb1_w_ = text_encoder.text_model.embeddings.token_embedding.weight.detach().cpu().data[tok1_id].clone().numpy()
                    emb2_w_ = text_encoder.text_model.embeddings.token_embedding.weight.detach().cpu().data[tok2_id].clone().numpy()
                    emb3_w_ = text_encoder.text_model.embeddings.token_embedding.weight.detach().cpu().data[tok3_id].clone().numpy()

                    # VAE
                    ci_w_ = vae.encoder.conv_in.weight.detach().cpu().flatten()[0:100].clone().numpy()
                    mb_w_ = vae.encoder.mid_block.attentions[0].proj_attn.weight.detach().cpu().flatten()[0:100].clone().numpy()
                    co_w_ = vae.encoder.conv_out.weight.detach().cpu().flatten()[0:100].clone().numpy()

                    # UNET
                    u_w_ = unet.conv_in.weight.detach().cpu().flatten()[0:100].clone().numpy()

                    for name, before, after in zip(
                        ['emb1_w', 'emb2_w', 'emb3_w', 'ci_w', 'mb_w', 'co_w', 'u_w'],
                        [emb1_w, emb2_w, emb3_w, ci_w, mb_w, co_w, u_w],
                        [emb1_w_, emb2_w_, emb3_w_, ci_w_, mb_w_, co_w_, u_w_],
                    ):
                        # ignore these changes
                        if name in ['u_w', 'emb1_w']:
                            continue
                        if not np.allclose(before, after):
                            print(f'CHANGED: {name}: \n{before[:10]} \n{after[:10]}')
                ##################################################

                # Convert images to latent space
                with torch.no_grad():
                    latents = vae.encode(batch["pixel_values"]).latent_dist.sample()
                    latents = latents * 0.18215

                # Sample noise that we'll add to the latents
                noise = torch.randn(latents.shape).to(latents.device)
                bsz = latents.shape[0]
                # Sample a random timestep for each image
                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
                timesteps = timesteps.long()

                # Add noise to the latents according to the noise magnitude at each timestep
                # (this is the forward diffusion process)
                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

                # Get the text embedding for conditioning
                # (not under no_grad: the token embedding is being trained)
                encoder_hidden_states = text_encoder(batch["input_ids"])[0]

                # Predict the noise residual
                noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

                if args.with_prior_preservation:
                    # Chunk the noise and noise_pred into two parts and compute the loss on each part separately.
                    noise_pred, noise_pred_prior = torch.chunk(noise_pred, 2, dim=0)
                    noise, noise_prior = torch.chunk(noise, 2, dim=0)

                    # Compute instance loss
                    loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()

                    # Compute prior loss
                    prior_loss = F.mse_loss(noise_pred_prior, noise_prior, reduction="none").mean([1, 2, 3]).mean()

                    # Add the prior loss to the instance loss.
                    loss = loss + args.prior_loss_weight * prior_loss
                else:
                    loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()

                accelerator.backward(loss)

                # * Textual inversion doesn't clip
                # * Fine-tuning ex only clips during sync: # https://github.com/huggingface/diffusers/pull/356/files#diff-47856c771c9f57d4cd90fbc472346aeafc70fc5f9e599c41de1d09e4f655b5d1R626
                # * originally DB clipped all the time
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)

                ##################################################
                # Textual Inversion

                # Zero out the gradients for all token embeddings except the newly added
                # embeddings for the concept, as we only want to optimize the concept embeddings
                if accelerator.num_processes > 1:
                    grads = text_encoder.module.get_input_embeddings().weight.grad
                else:
                    grads = text_encoder.get_input_embeddings().weight.grad
                grads.data[index_grads_to_zero, :] = 0

                # # A hack to undo Adam weight decay, keeping TI from updating the
                # # entire embedding weights.
                # if accelerator.sync_gradients:
                #     with torch.no_grad():
                #         text_encoder.get_input_embeddings().weight[index_grads_to_zero, :] += (
                #             lr_scheduler.get_last_lr()[0] *
                #             args.adam_weight_decay *
                #             text_encoder.get_input_embeddings().weight[index_grads_to_zero, :]
                #         )
                ##################################################

                optimizer.step()

                # Another hack to keep non-trained embeddings from
                # training. 0-ing the grad doesn't work alone, so the frozen
                # rows are copied back from the snapshot. This is done AFTER
                # the optimizer step so that whatever the step did to those
                # rows is undone before the next forward pass and before the
                # model is saved.
                if accelerator.sync_gradients:
                    with torch.no_grad():
                        text_encoder.get_input_embeddings().weight[index_grads_to_zero, :] = token_embed_w_copy[index_grads_to_zero, :]

                lr_scheduler.step()
                optimizer.zero_grad()

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
                progress_bar.update(1)
                global_step += 1

            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
            progress_bar.set_postfix(**logs)
            accelerator.log(logs, step=global_step)

            if global_step >= args.max_train_steps:
                break

        accelerator.wait_for_everyone()

    # Create the pipeline using using the trained modules and save it.
    if accelerator.is_main_process:
        pipeline = StableDiffusionPipeline.from_pretrained(
            args.pretrained_model_name_or_path,
            unet=accelerator.unwrap_model(unet),
            text_encoder=accelerator.unwrap_model(text_encoder),
            tokenizer=accelerator.unwrap_model(tokenizer),
            use_auth_token=args.use_auth_token,
        )
        pipeline.save_pretrained(args.output_dir)

    accelerator.end_training()
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment