Comparing generation speed with and without KV caching: the script below times GPT-2 generating 1000 new tokens, averaged over 10 runs, with use_cache enabled and disabled.
import numpy as np
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
# Time generation with and without the KV cache
for use_cache in (True, False):
    times = []
    for _ in range(10):  # measuring 10 generations
        start = time.time()
        model.generate(
            **tokenizer("What is KV caching?", return_tensors="pt").to(device),
            use_cache=use_cache,
            max_new_tokens=1000,
        )
        times.append(time.time() - start)
    print(f"{'with' if use_cache else 'without'} KV caching: "
          f"{round(np.mean(times), 3)} +- {round(np.std(times), 3)} seconds")