@hinsonan
Last active April 25, 2024 02:31
llama3-8b-instruct 4bit quantize
from time import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# 4-bit quantization via bitsandbytes: fp16 compute for the dequantized matmuls,
# with fp32 CPU offload allowed in case the model does not fit entirely on GPU.
config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=config,
    # torch_dtype=torch.bfloat16,
    # max_memory={i: "6000MB" for i in range(torch.cuda.device_count())},
    device_map="auto",
)
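
# Optional sanity check (not in the original gist): confirm the 4-bit load actually
# shrank the weights and see where accelerate placed each module.
# get_memory_footprint() and hf_device_map are standard transformers/accelerate attributes.
print(f"memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
print(model.hf_device_map)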

messages = [
    {"role": "system", "content": "You are a helpful decision aid that makes a decision if a scene is a marina, large dock, or airport"},
    {"role": "user", "content": "I have an image that has 20 small boats and no large vessels in it. There is also a small biplane in the water. what type of scene is this image most likely describing"},
]

start = time()

# Render the chat into Llama 3's prompt format and move the token ids to the model's device.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

# Llama 3 can end a turn with either the regular EOS token or <|eot_id|>.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

outputs = model.generate(
    input_ids,  # token ids must stay integer; the original float32 round-trip cast was unnecessary
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)

# Keep only the newly generated tokens, dropping the echoed prompt.
response = outputs[0][input_ids.shape[-1]:]
print(tokenizer.decode(response, skip_special_tokens=True))
print(time() - start)
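
# Optional follow-up (not in the original gist): normalize the raw wall-clock number
# into tokens/second, which is easier to compare across quantization settings.
# Note this elapsed time also includes the decode/print above, so treat it as approximate.
elapsed = time() - start
print(f"{len(response)} new tokens, ~{len(response) / elapsed:.2f} tokens/sec")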