Skip to content

Instantly share code, notes, and snippets.

View kaiokendev's full-sized avatar
🎯
working, if you need help send email or DM

kaiokendev

🎯
working, if you need help send email or DM
View GitHub Profile
@ArthurZucker
ArthurZucker / static_kv_cache.py
Last active October 21, 2024 02:08
simple static kv cache script
from transformers import AutoModelForCausalLM, AutoTokenizer, StaticCache
import torch
from typing import Optional
# Device identifier string, hard-coded to "cuda". Its consumers are outside
# this excerpt -- presumably used to place the model/tensors; confirm below.
device = "cuda"
# Copied from the gpt-fast repo
def multinomial_sample_one_no_sync(probs_sort: torch.Tensor) -> torch.Tensor:
    """Sample one index along the last dimension without a CUDA synchronization.

    Performs multinomial sampling by dividing the probabilities by
    Exponential(1) noise and taking the argmax, instead of calling a
    dedicated multinomial sampler.

    Args:
        probs_sort: Tensor of probabilities; categories lie along the last
            dimension. Any leading dimensions are sampled independently.

    Returns:
        A ``torch.int`` tensor shaped like ``probs_sort`` except that the
        last dimension has size 1, holding the sampled index.
    """
    # One Exponential(rate=1) draw per entry, same shape/dtype/device as input.
    noise = torch.empty_like(probs_sort).exponential_(1)
    # The entry with the largest probability-to-noise ratio is the sample.
    sampled = torch.argmax(probs_sort / noise, dim=-1, keepdim=True)
    return sampled.to(dtype=torch.int)