sayakpaul/run_hunyuan_dit_less_memory.py

## run_hunyuan_dit_less_memory.py
"""
Make sure you have `diffusers`, `accelerate`, `transformers`, and `bitsandbytes` installed.

You also need to set up PyTorch and CUDA.

Once the dependencies are installed, you can run `python run_hunyuan_dit_less_memory.py`.
"""

from diffusers import HunyuanDiTPipeline
from transformers import T5EncoderModel
import torch
import gc


def flush():
    """Run Python garbage collection, then empty PyTorch's CUDA cache.

    The collection happens first so that objects which are no longer
    referenced are gone before `torch.cuda.empty_cache()` is called.
    """
    gc.collect()
    torch.cuda.empty_cache()

def bytes_to_giga_bytes(bytes):
    """Convert a byte count into units of 1024**3 bytes.

    Args:
        bytes: Number of bytes (note: the parameter name shadows the builtin,
            but it is kept as-is so keyword callers keep working).

    Returns:
        The input divided by 1024 three times in a row.
    """
    kib = bytes / 1024
    mib = kib / 1024
    return mib / 1024


# Repository identifier passed to every `from_pretrained` call below.
# Named `model_id` rather than `id` so the builtin `id()` is not shadowed.
model_id = "Tencent-Hunyuan/HunyuanDiT-Diffusers"

# --- Stage 1: compute the text embeddings ------------------------------------
# The second text encoder is loaded separately with `load_in_8bit=True`
# (8-bit loading is why `bitsandbytes` is listed as a dependency above).
text_encoder_2 = T5EncoderModel.from_pretrained(
    model_id,
    subfolder="text_encoder_2",
    load_in_8bit=True,
    device_map="auto",
)
# `transformer=None` and `vae=None` are passed so this first pipeline is only
# used for prompt encoding.
pipeline = HunyuanDiTPipeline.from_pretrained(
    model_id,
    text_encoder_2=text_encoder_2,
    transformer=None,
    vae=None,
    torch_dtype=torch.float16,
    device_map="balanced",
)

with torch.no_grad():
    prompt = "一个宇航员在骑马"  # Chinese for "an astronaut riding a horse"
    # First call: default `text_encoder_index`.
    (
        prompt_embeds,
        negative_prompt_embeds,
        prompt_attention_mask,
        negative_prompt_attention_mask,
    ) = pipeline.encode_prompt(prompt)
    # Second call: `text_encoder_index=1` with a 256-token limit
    # (presumably this selects `text_encoder_2` -- confirm in the diffusers docs).
    (
        prompt_embeds_2,
        negative_prompt_embeds_2,
        prompt_attention_mask_2,
        negative_prompt_attention_mask_2,
    ) = pipeline.encode_prompt(
        prompt=prompt,
        negative_prompt=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
        prompt_attention_mask=None,
        negative_prompt_attention_mask=None,
        max_sequence_length=256,
        text_encoder_index=1,
    )


# --- Stage 2: drop the encoders before loading the rest ----------------------
# Only the embeddings and attention masks computed above are still needed.
del text_encoder_2
del pipeline
flush()

# --- Stage 3: generate the image from the precomputed embeddings -------------
# Both text encoders are passed as None here; the prompt information is
# supplied through the `*_embeds` / `*_attention_mask` arguments instead.
pipe = HunyuanDiTPipeline.from_pretrained(
    model_id,
    text_encoder=None,
    text_encoder_2=None,
    torch_dtype=torch.float16,
).to("cuda")

image = pipe(
    negative_prompt=None,
    prompt_embeds=prompt_embeds,
    prompt_embeds_2=prompt_embeds_2,
    negative_prompt_embeds=negative_prompt_embeds,
    negative_prompt_embeds_2=negative_prompt_embeds_2,
    prompt_attention_mask=prompt_attention_mask,
    prompt_attention_mask_2=prompt_attention_mask_2,
    negative_prompt_attention_mask=negative_prompt_attention_mask,
    negative_prompt_attention_mask_2=negative_prompt_attention_mask_2,
    num_images_per_prompt=1,
).images[0]

# Report the value of `torch.cuda.max_memory_allocated()` (converted with
# `bytes_to_giga_bytes`) and write the generated image to disk.
print(
    f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB"
)
image.save("memory_optimized.png")
	"""
	Make sure you have `diffusers`, `accelerate`, `transformers`, and `bitsandbytes` installed.

	You also need to set up PyTorch and CUDA.

	Once the dependencies are installed, you can run `python run_hunyuan_dit_less_memory.py`.
	"""

	from diffusers import HunyuanDiTPipeline
	from transformers import T5EncoderModel
	import torch
	import gc


	def flush():
	    """Run Python garbage collection, then empty PyTorch's CUDA cache.

	    The collection happens first so that objects which are no longer
	    referenced are gone before `torch.cuda.empty_cache()` is called.
	    """
	    # The body is nested under the `def`; with both statements at the
	    # `def`'s own column the function had no body.
	    gc.collect()
	    torch.cuda.empty_cache()

	def bytes_to_giga_bytes(bytes):
	    """Convert a byte count into units of 1024**3 bytes.

	    Args:
	        bytes: Number of bytes (the name shadows the builtin but is kept
	            so keyword callers keep working).

	    Returns:
	        The input divided by 1024 three times in a row.
	    """
	    # The `return` is nested under the `def`; at the `def`'s own column
	    # the function had no body.
	    return bytes / 1024 / 1024 / 1024


	# Repository identifier passed to every `from_pretrained` call below.
	# Named `model_id` rather than `id` so the builtin `id()` is not shadowed.
	model_id = "Tencent-Hunyuan/HunyuanDiT-Diffusers"

	# --- Stage 1: compute the text embeddings --------------------------------
	# The second text encoder is loaded separately with `load_in_8bit=True`
	# (8-bit loading is why `bitsandbytes` is listed as a dependency above).
	text_encoder_2 = T5EncoderModel.from_pretrained(
	    model_id,
	    subfolder="text_encoder_2",
	    load_in_8bit=True,
	    device_map="auto",
	)
	# `transformer=None` and `vae=None` are passed so this first pipeline is
	# only used for prompt encoding.
	pipeline = HunyuanDiTPipeline.from_pretrained(
	    model_id,
	    text_encoder_2=text_encoder_2,
	    transformer=None,
	    vae=None,
	    torch_dtype=torch.float16,
	    device_map="balanced",
	)

	# The statements below are nested under the `with`; at the `with`'s own
	# column the context manager had no body.
	with torch.no_grad():
	    prompt = "一个宇航员在骑马"  # Chinese for "an astronaut riding a horse"
	    # First call: default `text_encoder_index`.
	    (
	        prompt_embeds,
	        negative_prompt_embeds,
	        prompt_attention_mask,
	        negative_prompt_attention_mask,
	    ) = pipeline.encode_prompt(prompt)
	    # Second call: `text_encoder_index=1` with a 256-token limit
	    # (presumably this selects `text_encoder_2` -- confirm in the diffusers docs).
	    (
	        prompt_embeds_2,
	        negative_prompt_embeds_2,
	        prompt_attention_mask_2,
	        negative_prompt_attention_mask_2,
	    ) = pipeline.encode_prompt(
	        prompt=prompt,
	        negative_prompt=None,
	        prompt_embeds=None,
	        negative_prompt_embeds=None,
	        prompt_attention_mask=None,
	        negative_prompt_attention_mask=None,
	        max_sequence_length=256,
	        text_encoder_index=1,
	    )


	# --- Stage 2: drop the encoders before loading the rest ------------------
	# Only the embeddings and attention masks computed above are still needed.
	del text_encoder_2
	del pipeline
	flush()

	# --- Stage 3: generate the image from the precomputed embeddings ---------
	# Both text encoders are passed as None here; the prompt information is
	# supplied through the `*_embeds` / `*_attention_mask` arguments instead.
	pipe = HunyuanDiTPipeline.from_pretrained(
	    model_id,
	    text_encoder=None,
	    text_encoder_2=None,
	    torch_dtype=torch.float16,
	).to("cuda")

	image = pipe(
	    negative_prompt=None,
	    prompt_embeds=prompt_embeds,
	    prompt_embeds_2=prompt_embeds_2,
	    negative_prompt_embeds=negative_prompt_embeds,
	    negative_prompt_embeds_2=negative_prompt_embeds_2,
	    prompt_attention_mask=prompt_attention_mask,
	    prompt_attention_mask_2=prompt_attention_mask_2,
	    negative_prompt_attention_mask=negative_prompt_attention_mask,
	    negative_prompt_attention_mask_2=negative_prompt_attention_mask_2,
	    num_images_per_prompt=1,
	).images[0]

	# Report the value of `torch.cuda.max_memory_allocated()` (converted with
	# `bytes_to_giga_bytes`) and write the generated image to disk.
	print(
	    f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB"
	)
	image.save("memory_optimized.png")