@Multihuntr
Created June 20, 2024 09:38
Run Stable Diffusion 3 from the command line with limited resources.
name: sb3
channels:
  - pytorch
  - nvidia
  - conda-forge
dependencies:
  - diffusers
  - transformers
  - pytorch
  - torchvision
  - pytorch-cuda=12.1  # Note: I am using 12.2, but pytorch-cuda doesn't have a version for that; seems to work anyway
  - accelerate
  - sentencepiece
  - protobuf
  - pip:
      # The conda-forge version wasn't registering my CUDA for some reason
      - https://github.com/TimDettmers/bitsandbytes/releases/download/0.43.1/bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl
# Based on: https://gist.github.com/sayakpaul/82acb5976509851f2db1a83456e504f1
# Adds an argparser and splits the VAE out as a separate step for VRAM savings.
# Removes the performance monitoring: this is meant as a script to use, not a benchmark.
# Takes about 4 minutes per image with default settings on my GTX 1060.
import argparse
import gc
from diffusers import StableDiffusion3Pipeline, SD3Transformer2DModel
from transformers import T5EncoderModel, BitsAndBytesConfig
import torch
def flush():
    gc.collect()
    torch.cuda.empty_cache()
def parse_args():
    parser = argparse.ArgumentParser(description='Generate SD3-medium images using the T5 text encoder with 5GB VRAM and 12GB RAM')
    parser.add_argument('prompt', type=str)
    parser.add_argument('--neg_prompt', type=str, default=None)
    parser.add_argument('--pooled_prompt', type=str, default=None)
    parser.add_argument('--model_id', type=str, default="stabilityai/stable-diffusion-3-medium-diffusers")
    parser.add_argument('--steps', type=int, default=28)
    parser.add_argument('--num_generate', '-n', type=int, default=1)
    parser.add_argument('--out_name', type=str, default='output')
    return parser.parse_args()
def main(args):
    for i in range(args.num_generate):
        filename = f'{args.out_name}_{i}.png'
        prompt_embeds = embed_prompt(args.model_id, args.prompt, args.neg_prompt, args.pooled_prompt)
        generate(args.model_id, prompt_embeds, args.steps, filename)
def embed_prompt(model_id, prompt, neg_prompt, pooled_prompt):
    quantization_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
    # A little unnecessary, since the main cost is generating,
    # but might as well use it if we've got it
    device_map = {
        'shared': 0,
        'encoder.embed_tokens': 0,
        'encoder.block.0': 0,
        'encoder.block.1': 0,
        'encoder.block.2': 0,
        'encoder.block.3': 0,
        'encoder.block.4': 0,
        'encoder.block.5': 0,
        'encoder.block.6': 0,
        'encoder.block.7': 0,
        'encoder.block.8': 0,
        'encoder.block.9': 'cpu',
        'encoder.block.10': 'cpu',
        'encoder.block.11': 'cpu',
        'encoder.block.12': 'cpu',
        'encoder.block.13': 'cpu',
        'encoder.block.14': 'cpu',
        'encoder.block.15': 'cpu',
        'encoder.block.16': 'cpu',
        'encoder.block.17': 'cpu',
        'encoder.block.18': 'cpu',
        'encoder.block.19': 'cpu',
        'encoder.block.20': 'cpu',
        'encoder.block.21': 'cpu',
        'encoder.block.22': 'cpu',
        'encoder.block.23': 0,
        'encoder.final_layer_norm': 0,
        'encoder.dropout': 0,
    }
    text_encoder = T5EncoderModel.from_pretrained(
        model_id,
        subfolder="text_encoder_3",
        quantization_config=quantization_config,
        device_map=device_map
    )
    pipeline = StableDiffusion3Pipeline.from_pretrained(
        model_id,
        text_encoder_3=text_encoder,
        transformer=None,
        vae=None,
        device_map='balanced'
    )
    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    ) = pipeline.encode_prompt(prompt=prompt, prompt_2=neg_prompt, prompt_3=pooled_prompt)
    del text_encoder
    del pipeline
    flush()
    return (
        prompt_embeds.cuda().half(),
        negative_prompt_embeds.cuda().half(),
        pooled_prompt_embeds.cuda().half(),
        negative_pooled_prompt_embeds.cuda().half(),
    )
def generate(model_id, prompt_embeds, steps, filename):
    pipeline = StableDiffusion3Pipeline.from_pretrained(
        model_id,
        text_encoder=None,
        text_encoder_2=None,
        text_encoder_3=None,
        tokenizer=None,
        tokenizer_2=None,
        tokenizer_3=None,
        torch_dtype=torch.float16
    ).to("cuda")
    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    ) = prompt_embeds
    latents = pipeline(
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
        num_inference_steps=steps,
        guidance_scale=5.0,
        output_type='latent'
    ).images
    # Don't have enough VRAM to run the VAE at the end, too :'(
    # So, I'm copy-pasting the last few lines of the pipeline call (as discovered through inspect.getsource),
    # deleting the transformer to clear space on the GPU before running the VAE.
    pipeline_vae = pipeline.vae
    pipeline_image_processor = pipeline.image_processor
    scale, shift = pipeline.vae.config.scaling_factor, pipeline.vae.config.shift_factor
    del pipeline
    flush()
    latents = (latents / scale) + shift
    images = pipeline_vae.decode(latents, return_dict=False)[0]
    images = pipeline_image_processor.postprocess(images, output_type='pil')
    images[0].save(filename)
    del pipeline_vae
    del pipeline_image_processor
    flush()
if __name__ == '__main__':
    with torch.no_grad():
        main(parse_args())
Works on Ubuntu 22.04, GTX 1060, CUDA 12.2, NVIDIA driver 535.183.01, miniforge (mamba).

Here's how I got my environment set up:

[If you don't have conda/mamba] Install miniforge:

curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
bash Miniforge3-$(uname)-$(uname -m).sh
~/miniforge3/bin/mamba init bash && ~/miniforge3/condabin/conda config --set auto_activate_base false

Restart terminal.

[If you don't have the latest CUDA/graphics card drivers]: sudo ubuntu-drivers install
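To sanity-check the driver afterwards, nvidia-smi should report the driver version and a CUDA version of at least 12.1:

nvidia-smi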

Create a folder and copy the two files above into it.
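A minimal sketch, assuming the files are saved as environment.yml and run_sb3.py (the names used by the commands below):

mkdir sd3 && cd sd3
# copy environment.yml and run_sb3.py from this gist into the new folder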

Create the environment: mamba env create -p envs/sb3 --file environment.yml
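Then activate it before running anything (the prefix matches the -p flag above):

mamba activate ./envs/sb3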

Pass the gate on Hugging Face so you can download the model:

  1. Go to https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers/tree/main, log in to Hugging Face, and provide your name and email.
  2. Log in with the Hugging Face CLI: huggingface-cli login. Follow the prompts (a non-interactive variant is sketched below). I gave my token only one permission: "Repos/Read access to contents of all public gated repos you can access".
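If you'd rather log in non-interactively (e.g. from a script), recent versions of the CLI also accept the token as a flag; HF_TOKEN here is just a placeholder for wherever you keep yours:

huggingface-cli login --token "$HF_TOKEN"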

Run script: python run_sb3.py "<enter your prompt here>".
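The remaining flags map directly onto the argparser in the script. For example, this (with a made-up prompt) writes fox_0.png and fox_1.png:

python run_sb3.py "a watercolor painting of a red fox in the snow" --steps 28 -n 2 --out_name fox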
