@trygvebw
Last active April 29, 2024 07:50
A way to use high classifier-free guidance (CFG) scales with Stable Diffusion by applying an unsharp mask to the model output, while avoiding the artifacts and excessive contrast/saturation this usually produces
# This is an abbreviated demonstration of how to perform this technique. The code
# is a simplified version of that in my own custom codebase, and can't be plugged
# into other ways of using Stable Diffusion (e.g. Diffusers or A1111) without changes.
# In essence, the observation that the CFG formula:
#
# output_noise = uncond_noise + (cond_noise - uncond_noise) * scale
#
# looks a lot like the formula for the unsharp mask, a common way to sharpen or add local contrast to images:
#
# sharpened_image = original_image + (original_image - gaussian_blurred_image) * strength
#
# led me to try applying a "reverse" unsharp mask to the conditional output of the model, in order to get rid
# of the easily noticeable artifacts and exaggerated contrast and saturation that output images tend to suffer
# from when generating images with Stable Diffusion at high classifier-free guidance (CFG) scales. (A small
# numeric illustration of this parallel follows the imports below.)
#
# I've previously spent quite a bit of time trying to fix this issue; see e.g. my post here:
# https://www.reddit.com/r/StableDiffusion/comments/xalo78/fixing_excessive_contrastsaturation_resulting/
# ...but the results using those techniques are far from perfect: they tend to cause output images to suffer
# from other issues like desaturation, and are not powerful enough to handle extreme CFG scales like 60 or 80.
# In contrast, this method performs a lot better, and by careful tuning of the parameters you can use very
# high CFG scales indeed without substantially degrading output image quality.
#
# One question that's important to answer preemptively is whether effectively blurring the conditional model output
# is "cheating", in the sense that the images generated using this method at high CFG scales are "effectively" generated
# at a much lower CFG scale because of the blur. It's hard to say exactly what an image generated at a high CFG scale
# "should" look like if generated using a proper, theory-backed fix to the problem at hand (which this isn't), but my
# intuition says that at high CFG scales (higher than the usual range of roughly 0.0–15.0), generated images should
#
# A) display better prompt adherence than images generated at a lower CFG scale
# B) display less diversity, i.e. two images generated with different initial noises should be more similar
#
# I can't definitively prove that the images generated at high CFG scales using this technique display these
# two properties, but I absolutely *think* they do – you be the judge. :)
#
# A few additional notes:
#
# - applying the unsharp mask not to the predicted noise but to the predicted final latent (x0) also works
# (you'll need to experiment with parameters, though) – I apply it to the noise because of the way my own
# codebase is structured.
# - since the unsharp mask is simply a convoluted way of applying a Gaussian blur, you can also simply blur the
# output noise instead of applying an unsharp mask (see the sketch after apply_unsharp_mask below). I do it
# this way because of the similarity between CFG and the unsharp mask noted previously.
# - I've unsuccessfully experimented with various methods to autodetect a suitable mix_factor or unsharp mask sigma
# (by adjusting one or both based on the estimated noise level, the global contrast level or the maximum edge contrast).
# If you figure out a smart way to do this, please tell me about it! Right now I use a simple adjustment based on
# CFG scale, but this is definitely not optimal.
# - you still can't turn the CFG scale arbitrarily high – the exact max that still gives good-looking images varies,
# depending on prompt and the number of steps, but on occasion I've successfully used CFG scales just over 100.
import torch
import kornia as KR
import k_diffusion as K
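
# A small numeric illustration (my addition, not part of the original technique) of the
# analogy in the header comment: the unsharp mask formula at strength `s` is exactly the
# CFG formula at scale `s + 1`, with the Gaussian-blurred image playing the role of the
# unconditional prediction. The tensor shape and blur parameters here are arbitrary.
def demo_cfg_unsharp_mask_analogy(strength=1.5):
    image = torch.rand(1, 3, 32, 32)
    blurred = KR.filters.gaussian_blur2d(image, (5, 5), (2.0, 2.0))
    # Unsharp mask: push the image away from its blurred copy
    sharpened = image + (image - blurred) * strength
    # CFG form: start from the "uncond" (blurred) prediction and move toward
    # the "cond" (original) prediction with scale strength + 1
    cfg_like = blurred + (image - blurred) * (strength + 1)
    assert torch.allclose(sharpened, cfg_like, atol=1e-6)
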
DEFAULT_MIX_FACTOR = 0.003
# The most important parameter here is the mix factor, which controls how strongly the
# unsharp-masked noise is blended into the original noise prediction.
def apply_unsharp_mask(pred_noise, denoising_sigma, cfg_scale, mix_factor=DEFAULT_MIX_FACTOR, kernel_size=3):
    cond_scale_factor = min(0.02 * cfg_scale, 0.65)
    usm_sigma = torch.clamp(
        1 + denoising_sigma[[0]] * cond_scale_factor,
        min=1e-6)
    sharpened = KR.filters.unsharp_mask(
        pred_noise,
        (kernel_size, kernel_size),
        (usm_sigma, usm_sigma),
        border_type='reflect'
    )
    pred_noise = pred_noise + (sharpened - pred_noise) * mix_factor
    return pred_noise
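
# A hedged sketch (mine, not from the original gist) of the simpler variant described in
# the notes above: blending the conditional noise toward a Gaussian-blurred copy of itself
# instead of applying an unsharp mask. The way the blur sigma is derived mirrors
# apply_unsharp_mask, but the blend direction, mix factor and kernel size are assumptions
# that would need retuning.
def apply_gaussian_blur(pred_noise, denoising_sigma, cfg_scale, mix_factor=DEFAULT_MIX_FACTOR, kernel_size=3):
    cond_scale_factor = min(0.02 * cfg_scale, 0.65)
    blur_sigma = torch.clamp(
        1 + denoising_sigma[[0]] * cond_scale_factor,
        min=1e-6)
    blurred = KR.filters.gaussian_blur2d(
        pred_noise,
        (kernel_size, kernel_size),
        (blur_sigma, blur_sigma),
        border_type='reflect')
    # Blend a small amount of the blurred noise into the prediction
    return pred_noise + (blurred - pred_noise) * mix_factor
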
# This class is based on Katherine Crowson's DiscreteEpsDDPMDenoiser:
# https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/external.py
class DenoiserWrapper(K.external.DiscreteSchedule):
    def __init__(self, model, quantize, cond_emb, uncond_emb, cfg_scale, unsharp_mask_mix_factor=DEFAULT_MIX_FACTOR):
        super().__init__(((1 - model.alphas_cumprod) / model.alphas_cumprod) ** 0.5, quantize)
        self.inner_model = model
        self.sigma_data = 1.
        self.cond_emb = cond_emb  # Conditional embedding
        self.uncond_emb = uncond_emb  # "Unconditional" embedding, i.e. the embedding of the empty string
        self.cfg_scale = cfg_scale
        self.unsharp_mask_mix_factor = unsharp_mask_mix_factor

    def get_scalings(self, sigma):
        c_out = -sigma
        c_in = 1 / (sigma ** 2 + self.sigma_data ** 2) ** 0.5
        return c_out, c_in

    def get_eps(self, *args, **kwargs):
        return self.inner_model.apply_model(*args, **kwargs)

    def loss(self, input, noise, sigma, **kwargs):
        c_out, c_in = [K.utils.append_dims(x, input.ndim) for x in self.get_scalings(sigma)]
        noised_input = input + noise * K.utils.append_dims(sigma, input.ndim)
        eps = self.get_eps(noised_input * c_in, self.sigma_to_t(sigma), **kwargs)
        return (eps - noise).pow(2).flatten(1).mean(1)

    def forward(self, input, sigma, **kwargs):
        c_out, c_in = [K.utils.append_dims(x, input.ndim) for x in self.get_scalings(sigma)]
        uncond_eps = self.get_eps(input * c_in, self.sigma_to_t(sigma), cond=self.uncond_emb, **kwargs)
        cond_eps = self.get_eps(input * c_in, self.sigma_to_t(sigma), cond=self.cond_emb, **kwargs)
        # Apply the unsharp mask to the conditional noise prediction before CFG mixing
        cond_eps = apply_unsharp_mask(cond_eps, sigma, self.cfg_scale, mix_factor=self.unsharp_mask_mix_factor)
        eps = uncond_eps + (cond_eps - uncond_eps) * self.cfg_scale
        return input + eps * c_out
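
# A hedged sketch (not part of the original gist) of how the wrapper might be driven with
# k-diffusion's DPM++ 2M sampler, matching the 50-step k_dpmpp_2m setup mentioned in the
# comments below. `ldm_model`, `cond_emb` and `uncond_emb` are assumed to come from the
# caller's own Stable Diffusion codebase; the latent shape corresponds to 512x512 images.
def sample_with_unsharp_mask_cfg(ldm_model, cond_emb, uncond_emb, cfg_scale=45.0, steps=50, shape=(1, 4, 64, 64)):
    denoiser = DenoiserWrapper(ldm_model, False, cond_emb, uncond_emb, cfg_scale)
    sigmas = denoiser.get_sigmas(steps)  # DiscreteSchedule supplies the sigma schedule
    x = torch.randn(shape, device=sigmas.device) * sigmas[0]  # Start from pure noise at sigma_max
    return K.sampling.sample_dpmpp_2m(denoiser, x, sigmas)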
@trygvebw (author):

Example images generated at cfg_scale = 45.0 while not using this technique:

[three example images]

Example images generated at the same CFG scale while using this technique:

[three example images]

All images generated with 50 steps and sampler k_dpmpp_2m from k-diffusion.

@zhujiagang:

Thanks for your code. I've tried it on my case: SD 1.4, cfg=5.0, UniPC sampler. I set usm_sigma to 2.0, because your code does not have a default value for it. At least at present, cfg=5.0 + unsharp mask did not solve my problem. In my case, cfg=2.5 is more realistic than cfg=5.0. I guess it is because I finetuned SD 1.4 without classifier-free guidance.
