exllamav2 reproducer
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache_TP, ExLlamaV2Tokenizer, Timer
from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2Sampler

model_dir = "/mnt/nlp/models/MistralLarge"
config = ExLlamaV2Config(model_dir)
config.arch_compat_overrides()
config.no_graphs = True
model = ExLlamaV2(config)

# Load the model in tensor-parallel mode. With no gpu_split specified, the model will attempt to split across
# all visible devices according to the currently available VRAM on each. expect_cache_tokens is necessary for
# balancing the split in case the GPUs are of uneven sizes, or if the number of GPUs doesn't divide the number
# of KV heads in the model.
#
# The cache type for a TP model is always ExLlamaV2Cache_TP and should be allocated after the model. To use a
# quantized cache, add a `base = ExLlamaV2Cache_Q6` etc. argument to the cache constructor. It's advisable to
# also pass `expect_cache_base = ExLlamaV2Cache_Q6` to load_tp() so the size can be correctly accounted for
# when splitting the model.
model.load_tp(progress = True, expect_cache_tokens = 16384)
cache = ExLlamaV2Cache_TP(model, max_seq_len = 16384)
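
# For illustration only (not used in this reproducer): the quantized-cache variant described above would
# look roughly like this, assuming ExLlamaV2Cache_Q6 is also imported from exllamav2:
#
#   model.load_tp(progress = True, expect_cache_tokens = 16384, expect_cache_base = ExLlamaV2Cache_Q6)
#   cache = ExLlamaV2Cache_TP(model, base = ExLlamaV2Cache_Q6, max_seq_len = 16384)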

# After loading the model, all other functions should work the same
print("Loading tokenizer...")
tokenizer = ExLlamaV2Tokenizer(config)

# Initialize the generator with all default parameters
generator = ExLlamaV2DynamicGenerator(
    model = model,
    cache = cache,
    tokenizer = tokenizer,
)

max_new_tokens = 500

gen_settings = ExLlamaV2Sampler.Settings()

# defaults
gen_settings.temperature = 0.7

# Disable all of these
gen_settings.token_repetition_penalty = 1.0
gen_settings.top_k = 0
gen_settings.top_p = 1

print(gen_settings)

# Warm up the generator. The function runs a small completion job to allow all the kernels to fully initialize
# and autotune before we do any timing measurements. It can be a little slow for larger models and is not
# needed to produce correct output.
generator.warmup()

# Generate one completion with the settings above
prompt = """Bob was a man looking for a job. Bob rounded the corner, keeping an eye out. As Bob rounded the corner he kept an eye out.
Write in the style of a story. Continue the story."""

with Timer() as t_single:
    output = generator.generate(
        prompt = prompt,
        max_new_tokens = max_new_tokens,
        add_bos = True,
        gen_settings = gen_settings
    )

print("-----------------------------------------------------------------------------------")
print("- Single completion")
print("-----------------------------------------------------------------------------------")
print(output)
print()