# exllamav2 reproducer
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache_TP, ExLlamaV2Tokenizer, Timer
from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2Sampler
model_dir = "/mnt/nlp/models/MistralLarge"
config = ExLlamaV2Config(model_dir)
config.arch_compat_overrides()
config.no_graphs = True
model = ExLlamaV2(config)
# Load the model in tensor-parallel mode. With no gpu_split specified, the model will attempt to split across
# all visible devices according to the currently available VRAM on each. expect_cache_tokens is necessary for
# balancing the split, in case the GPUs are of uneven sizes, or if the number of GPUs doesn't divide the number
# of KV heads in the model
#
# The cache type for a TP model is always ExLlamaV2Cache_TP and should be allocated after the model. To use a
# quantized cache, add a `base = ExLlamaV2Cache_Q6` etc. argument to the cache constructor. It's advisable
# to also add `expect_cache_base = ExLlamaV2Cache_Q6` to load_tp() as well so the size can be correctly
# accounted for when splitting the model.
model.load_tp(progress = True, expect_cache_tokens = 16384)
cache = ExLlamaV2Cache_TP(model, max_seq_len = 16384)
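# As a minimal sketch of the quantized-cache variant described in the comment above
# (assuming ExLlamaV2Cache_Q6 is exported by the exllamav2 package), the two calls
# would become:
#
#   from exllamav2 import ExLlamaV2Cache_Q6
#   model.load_tp(progress = True, expect_cache_tokens = 16384, expect_cache_base = ExLlamaV2Cache_Q6)
#   cache = ExLlamaV2Cache_TP(model, base = ExLlamaV2Cache_Q6, max_seq_len = 16384)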
# After loading the model, all other functions should work the same
print("Loading tokenizer...")
tokenizer = ExLlamaV2Tokenizer(config)
# Initialize the generator with all default parameters
generator = ExLlamaV2DynamicGenerator(
    model = model,
    cache = cache,
    tokenizer = tokenizer,
)
max_new_tokens = 500
gen_settings = ExLlamaV2Sampler.Settings()
# Set the temperature explicitly
gen_settings.temperature = 0.7
# Disable all of these
gen_settings.token_repetition_penalty = 1.0
gen_settings.top_k = 0
gen_settings.top_p = 1
print(gen_settings)
# Warmup generator. The function runs a small completion job to allow all the kernels to fully initialize and
# autotune before we do any timing measurements. It can be a little slow for larger models and is not needed
# to produce correct output.
generator.warmup()
# Generate one completion, using default settings
prompt = """Bob was a man looking for a job. Bob rounded the corner, keeping an eye out. As Bob rounded the corner he kept an eye out.
Write in the style of a story. Continue the story."""
with Timer() as t_single:
    output = generator.generate(
        prompt = prompt,
        max_new_tokens = max_new_tokens,
        add_bos = True,
        gen_settings = gen_settings
    )
print("-----------------------------------------------------------------------------------")
print("- Single completion")
print("-----------------------------------------------------------------------------------")
print(output)
print()
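# The Timer context manager above captures the elapsed time in t_single. An optional
# way to report throughput (assuming Timer exposes the elapsed seconds as `interval`,
# as in the exllamav2 examples):
#
#   print(f"speed, bsz 1: {max_new_tokens / t_single.interval:.2f} tokens/second")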