@jph00
Created November 1, 2023 01:40
Manual QLoRA inference example
import torch, time, os, safetensors.torch
from torch import nn
from peft import get_peft_model, LoraConfig, TaskType
from bitsandbytes.nn import Linear4bit, Linear8bitLt
from transformers import AutoTokenizer, LlamaForCausalLM, AutoConfig, LlamaPreTrainedModel, BitsAndBytesConfig
from transformers.utils import hub, WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_INDEX_NAME
from transformers.integrations.bitsandbytes import replace_with_bnb_linear
from transformers.modeling_utils import no_init_weights
from accelerate import init_empty_weights
from accelerate.utils import set_module_tensor_to_device
mid = "meta-llama/Llama-2-7b-hf"
# mid = "stabilityai/stablelm-3b-4e1t"
# mid = 'Phind/Phind-CodeLlama-34B-v2'
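# Tokenizer; Llama has no pad token by default, so reuse EOS for padding.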
tokenizer = AutoTokenizer.from_pretrained(mid, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
cfg = AutoConfig.from_pretrained(mid, trust_remote_code=True)
qcfg = BitsAndBytesConfig(load_in_8bit=True)
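# The config above loads linear layers in 8-bit. Linear4bit is also imported; a 4-bit
# NF4 setup, which is what QLoRA training normally uses, would look roughly like the
# commented sketch below (an assumption, not part of the original script):
# qcfg = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
#                           bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True)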
with init_empty_weights():
    model = LlamaForCausalLM(cfg).eval()
model = replace_with_bnb_linear(model, quantization_config=qcfg)
model.is_loaded_in_8bit = True
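# Find the safetensors index for this checkpoint in the local HF cache and resolve
# the shard file paths it points to.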
idx = hub.cached_file(mid, SAFE_WEIGHTS_INDEX_NAME)
fns,maps = hub.get_checkpoint_shard_files(mid, idx)
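# Copy every tensor from each shard straight onto the GPU as fp16; accelerate re-creates
# each parameter with the module's existing parameter class, so the 8-bit quantization
# happens as the weights land on the GPU.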
for fn in fns:
    sd = safetensors.torch.load_file(fn)
    for n,p in sd.items(): set_module_tensor_to_device(model, n, 'cuda', value=p, dtype=torch.float16)
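# Buffers (e.g. rotary-embedding caches) are not stored in the checkpoint, so move them
# to the GPU by hand.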
for n, buffer in model.named_buffers():
    sn,_,bufn = n.rpartition('.')
    setattr(model.get_submodule(sn), bufn, buffer.cuda())
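# Wrap the quantized model with LoRA adapters on every attention and MLP projection.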
tgt = [l+"_proj" for l in ["k", "v", "q", "o", "gate", "up", "down"]]
peft_config = LoraConfig(r=8, lora_alpha=32, target_modules=tgt, bias="none",
                         task_type=TaskType.CAUSAL_LM, lora_dropout=0.05, inference_mode=False)
with no_init_weights(): model = get_peft_model(model, peft_config)
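# The adapters above are freshly initialized (lora_B starts at zero, so they are a no-op
# at inference). If trained LoRA weights exist, they could be loaded along these lines;
# the filename is a placeholder assumption, and the step is skipped when the file is absent:
adapter_fn = "adapter_model.safetensors"
if os.path.exists(adapter_fn):
    from peft import set_peft_model_state_dict
    set_peft_model_state_dict(model, safetensors.torch.load_file(adapter_fn))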
prompt = "Jeremy Howard is"
inputs = tokenizer(prompt, return_tensors="pt")
generate_ids = model.generate(**inputs.to('cuda'), max_length=30)[:, len(inputs['input_ids'][0]):]
print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])
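# Optional: rough wall-clock timing of another generation pass (uses the already imported
# `time`; not a rigorous benchmark, numbers vary by GPU and model size).
t0 = time.perf_counter()
model.generate(**inputs.to('cuda'), max_length=30)
print(f"second pass took {time.perf_counter()-t0:.2f}s")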