anotherjesse/just-clip.py

## just-clip.py
!pip install diffusers==0.4.0
!pip install transformers  ftfy

from transformers import CLIPTokenizer, CLIPTextModel
import torch

# Encode a text prompt (plus an empty "unconditional" prompt) with the CLIP
# text encoder and stack the two sets of embeddings into a single tensor.

# Run on the GPU when one is available, otherwise fall back to the CPU.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"

prompt = ["a photograph of an astronaut riding a horse"]

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
# The token ids are moved to `torch_device` before the forward pass, so the
# encoder weights have to live on that device too; otherwise the call fails
# with a device-mismatch error whenever CUDA is available.
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(
    torch_device
)

text_input = tokenizer(
    prompt,
    padding="max_length",
    max_length=tokenizer.model_max_length,
    truncation=True,
    return_tensors="pt",
)

## text_input holds input_ids: tensor and attention_mask: tensor.
## Because of padding="max_length" + truncation=True, every prompt is padded
## or cut to exactly tokenizer.model_max_length tokens, so input_ids always
## has shape (len(prompt), model_max_length) -- (1, 77) for this prompt.

with torch.no_grad():
    # Index 0 of the encoder output is the per-token hidden state.
    text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]

# text_embeddings is a tensor of shape torch.Size([1, 77, 768])

# Build one empty prompt per real prompt, padded to the same sequence length,
# so both embedding tensors can be concatenated along the batch dimension.
max_length = text_input.input_ids.shape[-1]
batch_size = len(prompt)
uncond_input = tokenizer(
    [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
)

with torch.no_grad():
    uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]

# uncond_embeddings is torch.Size([1, 77, 768])

# Unconditional embeddings first, prompt embeddings second.
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

# text_embeddings.shape => torch.Size([2, 77, 768])
	!pip install diffusers==0.4.0
	!pip install transformers ftfy

	from transformers import CLIPTokenizer, CLIPTextModel
	import torch

	# Encode a text prompt (plus an empty "unconditional" prompt) with the CLIP
	# text encoder and stack the two sets of embeddings into a single tensor.

	# Run on the GPU when one is available, otherwise fall back to the CPU.
	torch_device = "cuda" if torch.cuda.is_available() else "cpu"

	prompt = ["a photograph of an astronaut riding a horse"]

	tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
	# The token ids are moved to `torch_device` before the forward pass, so the
	# encoder weights have to live on that device too; otherwise the call fails
	# with a device-mismatch error whenever CUDA is available.
	text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(
	    torch_device
	)

	text_input = tokenizer(
	    prompt,
	    padding="max_length",
	    max_length=tokenizer.model_max_length,
	    truncation=True,
	    return_tensors="pt",
	)

	## text_input holds input_ids: tensor and attention_mask: tensor.
	## Because of padding="max_length" + truncation=True, every prompt is padded
	## or cut to exactly tokenizer.model_max_length tokens, so input_ids always
	## has shape (len(prompt), model_max_length) -- (1, 77) for this prompt.

	with torch.no_grad():
	    # Index 0 of the encoder output is the per-token hidden state.
	    text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]

	# text_embeddings is a tensor of shape torch.Size([1, 77, 768])

	# Build one empty prompt per real prompt, padded to the same sequence length,
	# so both embedding tensors can be concatenated along the batch dimension.
	max_length = text_input.input_ids.shape[-1]
	batch_size = len(prompt)
	uncond_input = tokenizer(
	    [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
	)

	with torch.no_grad():
	    uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]

	# uncond_embeddings is torch.Size([1, 77, 768])

	# Unconditional embeddings first, prompt embeddings second.
	text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

	# text_embeddings.shape => torch.Size([2, 77, 768])