Create a huggingface pipeline with a lora-trained alpaca
from typing import Optional, Any

import torch
from transformers.utils import is_accelerate_available, is_bitsandbytes_available
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationConfig,
    pipeline,
)
from peft import PeftModel

ALPACA_TEMPLATE = (
    "Below is an instruction that describes a task, paired with an input that provides "
    "further context. Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
)
def load_adapted_hf_generation_pipeline(
    base_model_name: str,
    lora_model_name: str,
    temperature: float = 0,
    top_p: float = 1.,
    max_tokens: int = 50,
    batch_size: int = 16,
    device: str = "cpu",
    load_in_8bit: bool = True,
    generation_kwargs: Optional[dict] = None,
):
    """
    Load a huggingface model & adapt with PEFT.
    Borrowed from https://github.com/tloen/alpaca-lora/blob/main/generate.py
    """
    if device == "cuda":
        if not is_accelerate_available():
            raise ValueError("Install `accelerate` to load the model on CUDA")
        if load_in_8bit and not is_bitsandbytes_available():
            raise ValueError("Install `bitsandbytes` to load the model in 8-bit")
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    task = "text-generation"

    if device == "cuda":
        # load the base model in 8-bit (or fp16) and let accelerate spread it across available GPUs
        model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            load_in_8bit=load_in_8bit,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        model = PeftModel.from_pretrained(
            model,
            lora_model_name,
            torch_dtype=torch.float16,
        )
    elif device == "mps":
        # fp16 on Apple Silicon
        model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            device_map={"": device},
            torch_dtype=torch.float16,
        )
        model = PeftModel.from_pretrained(
            model,
            lora_model_name,
            device_map={"": device},
            torch_dtype=torch.float16,
        )
    else:
        # full-precision CPU fallback
        model = AutoModelForCausalLM.from_pretrained(
            base_model_name, device_map={"": device}, low_cpu_mem_usage=True
        )
        model = PeftModel.from_pretrained(
            model,
            lora_model_name,
            device_map={"": device},
        )

    # unwind broken decapoda-research config
    model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
    model.config.bos_token_id = 1
    model.config.eos_token_id = 2

    if not load_in_8bit:
        model.half()  # seems to fix bugs for some users.

    model.eval()
    generation_kwargs = generation_kwargs if generation_kwargs is not None else {}
    config = GenerationConfig(
        do_sample=temperature > 0,  # temperature must be > 0 when sampling, so fall back to greedy decoding at 0
        temperature=temperature if temperature > 0 else 1.0,  # ignored when greedy decoding
        max_new_tokens=max_tokens,
        top_p=top_p,
        **generation_kwargs,
    )
    pipe = pipeline(
        task,
        model=model,
        tokenizer=tokenizer,
        batch_size=batch_size,
        generation_config=config,
        framework="pt",
    )

    return pipe
if __name__ == "__main__":
    pipe = load_adapted_hf_generation_pipeline(
        base_model_name="decapoda-research/llama-7b-hf",
        lora_model_name="tloen/alpaca-lora-7b",
    )
    prompt = ALPACA_TEMPLATE.format(
        instruction="Paraphrase the sentence.",
        input="The quick brown fox jumped over the lazy dog.",
    )
    print(pipe(prompt))
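The pipeline also accepts a list of prompts for batched generation, and each result uses the standard `transformers` text-generation output format: a list of dicts whose `generated_text` field includes the original prompt. A minimal sketch of batched use with the `pipe` created above (the example sentences are placeholders, not part of the original gist):

prompts = [
    ALPACA_TEMPLATE.format(instruction="Paraphrase the sentence.", input=sentence)
    for sentence in [
        "The quick brown fox jumped over the lazy dog.",
        "She sells seashells by the seashore.",
    ]
]
for prompt, result in zip(prompts, pipe(prompts)):
    # drop the echoed prompt to keep only the model's response
    print(result[0]["generated_text"][len(prompt):].strip())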
Heads up, I believe you're missing a comma at the end of line 116.