@mapmeld
Last active February 2, 2024 16:22
llama2-langchain
# this should run on a GPU Colab notebook
# pip install langchain xformers transformers datasets bitsandbytes accelerate --quiet
# request access to the meta-llama models on Hugging Face, accept the license, and create a read token
hf_auth = '######'
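# (optional, not in the original gist) instead of passing the token to every call,
# you could log in once with the huggingface_hub helper and drop the use_auth_token
# arguments below:
# from huggingface_hub import login
# login(token=hf_auth)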
from langchain.chains import ConversationChain
from langchain.llms import HuggingFacePipeline
from langchain.memory import ConversationSummaryBufferMemory
from langchain.prompts.prompt import PromptTemplate
from torch import cuda, bfloat16
import torch
from transformers import StoppingCriteria, StoppingCriteriaList
import transformers
model_id = 'meta-llama/Llama-2-7b-chat-hf'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)
model.eval()
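# (illustrative, not in the original gist) with 4-bit NF4 quantization the 7B model
# should fit on a single Colab GPU; you can verify the footprint with:
# print(f"model size: {model.get_memory_footprint() / 1e9:.1f} GB")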
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
stop_list = ['\nHuman:', '\n```\n']
# encode the stop strings without the BOS token the Llama tokenizer prepends by default,
# otherwise the generated tokens can never match them
stop_token_ids = [tokenizer(x, add_special_tokens=False)['input_ids'] for x in stop_list]
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False
stopping_criteria = StoppingCriteriaList([StopOnTokens()])
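# (illustrative, not in the original gist) print the token ids the criteria will watch for:
# for s, ids in zip(stop_list, stop_token_ids):
#     print(repr(s), ids.tolist())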
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  # without this the model rambles during chat
    temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this the output begins repeating
)
DEFAULT_TEMPLATE = """<s>[INST] <<SYS>>
The following is a friendly conversation between a human and an AI on a serious space mission. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.
Current conversation:
{history}
<</SYS>>
{input} [/INST]"""
PROMPT = PromptTemplate(input_variables=["history", "input"], template=DEFAULT_TEMPLATE)
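# (illustrative, not in the original gist) preview the rendered Llama-2 chat prompt:
# print(PROMPT.format(history="", input="Hello there"))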
# wrap the transformers pipeline so LangChain can use it as an LLM
llm = HuggingFacePipeline(pipeline=generate_text)
chain = ConversationChain(
    llm=llm,
    memory=ConversationSummaryBufferMemory(llm=llm, max_token_limit=100),
    prompt=PROMPT
)
# check that the full chain works end to end
response = chain.predict(input="Explain the difference between someone's competence and someone's competency.")
print(response)
# second turn exercises the conversation memory
print(chain.predict(input="What did I just ask about?"))
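# (illustrative, not in the original gist) inspect what the summary-buffer memory
# is carrying between turns:
# print(chain.memory.load_memory_variables({}))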