@jph00
Created November 1, 2023 01:40
Manual QLoRA inference example
import torch, time, os, safetensors.torch
from torch import nn
from peft import get_peft_model, LoraConfig, TaskType
from bitsandbytes.nn import Linear4bit, Linear8bitLt
from transformers import AutoTokenizer, LlamaForCausalLM, AutoConfig, LlamaPreTrainedModel, BitsAndBytesConfig
from transformers.utils import hub, WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_INDEX_NAME
from transformers.integrations.bitsandbytes import replace_with_bnb_linear
from transformers.modeling_utils import no_init_weights
from accelerate import init_empty_weights
from accelerate.utils import set_module_tensor_to_device
mid = "meta-llama/Llama-2-7b-hf"
# mid = "stabilityai/stablelm-3b-4e1t"
# mid = 'Phind/Phind-CodeLlama-34B-v2'
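# Tokenizer; Llama has no pad token by default, so reuse EOS for padding.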
tokenizer = AutoTokenizer.from_pretrained(mid, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
cfg = AutoConfig.from_pretrained(mid, trust_remote_code=True)
qcfg = BitsAndBytesConfig(load_in_8bit=True)
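# The config above loads linear layers in 8-bit. Linear4bit is also imported; a 4-bit
# NF4 setup, which is what QLoRA training normally uses, would look roughly like the
# commented sketch below (an assumption, not part of the original script):
# qcfg = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
#                           bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True)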
with init_empty_weights():
    model = LlamaForCausalLM(cfg).eval()
model = replace_with_bnb_linear(model, quantization_config=qcfg)
model.is_loaded_in_8bit = True
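# Find the safetensors index for this checkpoint in the local HF cache and resolve
# the shard file paths it points to.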
idx = hub.cached_file(mid, SAFE_WEIGHTS_INDEX_NAME)
fns,maps = hub.get_checkpoint_shard_files(mid, idx)
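# Copy every tensor from each shard straight onto the GPU as fp16; accelerate re-creates
# each parameter with the module's existing parameter class, so the 8-bit quantization
# happens as the weights land on the GPU.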
for fn in fns:
    sd = safetensors.torch.load_file(fn)
    for n,p in sd.items(): set_module_tensor_to_device(model, n, 'cuda', value=p, dtype=torch.float16)
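# Buffers (e.g. rotary-embedding caches) are not stored in the checkpoint, so move them
# to the GPU by hand.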
for n, buffer in model.named_buffers():
    sn,_,bufn = n.rpartition('.')
    setattr(model.get_submodule(sn), bufn, buffer.cuda())
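# Wrap the quantized model with LoRA adapters on every attention and MLP projection.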
tgt = [l+"_proj" for l in ["k", "v", "q", "o", "gate", "up", "down"]]
peft_config = LoraConfig(r=8, lora_alpha=32, target_modules=tgt, bias="none",
                         task_type=TaskType.CAUSAL_LM, lora_dropout=0.05, inference_mode=False)
with no_init_weights(): model = get_peft_model(model, peft_config)
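# The adapters above are freshly initialized (lora_B starts at zero, so they are a no-op
# at inference). If trained LoRA weights exist, they could be loaded along these lines;
# the filename is a placeholder assumption, and the step is skipped when the file is absent:
adapter_fn = "adapter_model.safetensors"
if os.path.exists(adapter_fn):
    from peft import set_peft_model_state_dict
    set_peft_model_state_dict(model, safetensors.torch.load_file(adapter_fn))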
prompt = "Jeremy Howard is"
inputs = tokenizer(prompt, return_tensors="pt")
generate_ids = model.generate(**inputs.to('cuda'), max_length=30)[:, len(inputs['input_ids'][0]):]
print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])
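# Optional: rough wall-clock timing of another generation pass (uses the already imported
# `time`; not a rigorous benchmark, numbers vary by GPU and model size).
t0 = time.perf_counter()
model.generate(**inputs.to('cuda'), max_length=30)
print(f"second pass took {time.perf_counter()-t0:.2f}s")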