# logging
import logging

logname = "log.txt"
logging.basicConfig(
    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
    datefmt='%H:%M:%S',
    level=logging.INFO,
    handlers=[
        logging.FileHandler(logname),
        logging.StreamHandler()
    ])
# imports
import sys
import torch
import transformers
from peft import PeftModel
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline, AutoModelForTokenClassification
from langchain.llms import HuggingFacePipeline
from langchain.llms.base import LLM
from llama_index import SimpleDirectoryReader, LangchainEmbedding, GPTListIndex, PromptHelper, LLMPredictor, ServiceContext
from typing import Optional, List, Mapping, Any
from langchain.agents import load_tools
from langchain.agents import initialize_agent
# Base model; alternatives are left commented out.
# base_model = "decapoda-research/llama-7b-hf"
# base_model = "decapoda-research/llama-13b-hf"
base_model = "circulus/alpaca-7b"
modelName = base_model
# LoRA adapter weights; alternatives are left commented out.
# lora_weights = "chansung/alpaca-lora-13b"
# lora_weights = "baruga/alpaca-lora-13b"
# lora_weights = "mattreid/alpaca-lora-13b"
# lora_weights = "Dogge/alpaca-lora-13b"
# lora_weights = "circulus/alpaca-lora-13b"
# lora_weights = "daviddmc/lpaca-lora-13b"
# lora_weights = "tloen/alpaca-lora-7b"
lora_weights = "circulus/alpaca-lora-7b"
# Pick the best available device: CUDA > MPS > CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
try:
    if torch.backends.mps.is_available():
        device = "mps"
except:  # noqa: E722
    pass
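# Note (added): load_in_8bit below relies on the bitsandbytes package and only takes
# effect on the CUDA branch; the MPS branch loads fp16 weights and the CPU branch
# falls back to full precision.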
load_8bit = True
tokenizer = LlamaTokenizer.from_pretrained(base_model)
print(device)
if device == "cuda":
    # GPU: load the base model in 8-bit, then merge in the LoRA adapter.
    model = LlamaForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=load_8bit,
        torch_dtype=torch.float16,
        device_map={'': 0},
    )
    model = PeftModel.from_pretrained(
        model,
        lora_weights,
        torch_dtype=torch.float16,
    )
elif device == "mps":
    # Apple Silicon: load in fp16 on the MPS backend.
    model = LlamaForCausalLM.from_pretrained(
        base_model,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
    model = PeftModel.from_pretrained(
        model,
        lora_weights,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
else:
    # CPU fallback: full precision, low memory usage while loading.
    model = LlamaForCausalLM.from_pretrained(
        base_model, device_map={"": device}, low_cpu_mem_usage=True
    )
    model = PeftModel.from_pretrained(
        model,
        lora_weights,
        device_map={"": device},
    )
# unwind broken decapoda-research config
model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
model.config.bos_token_id = 1
model.config.eos_token_id = 2
if not load_8bit:
    model.half()  # seems to fix bugs for some users.
model.eval()
if torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)
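# Optional smoke test (sketch added here, not part of the original gist): uncomment
# to check that the merged base + LoRA model generates before wiring up LangChain
# and llama_index. GenerationConfig is already imported above.
# test_inputs = tokenizer("### Instruction:\nSay hello.\n\nAnswer:", return_tensors="pt").to(device)
# with torch.no_grad():
#     test_out = model.generate(**test_inputs, generation_config=GenerationConfig(max_new_tokens=32))
# logging.info(tokenizer.decode(test_out[0], skip_special_tokens=True))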
#
# define prompt helper
# set maximum input size
max_input_size = 2048
# set number of output tokens
num_output = 200
# set maximum chunk overlap
max_chunk_overlap = 20
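# These three values are handed to PromptHelper below: they bound the model's
# context window, reserve room for the generated answer, and set how much
# neighbouring chunks overlap when llama_index splits the files in ./data.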
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=2000,
    temperature=0.1,
    num_beams=1,
    top_p=0.95,
    repetition_penalty=1.2
)
local_llm = HuggingFacePipeline(pipeline=pipe)
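# local_llm wraps the Hugging Face pipeline as a LangChain LLM. If you want to test
# it in isolation, it can be called directly with a prompt string (example added
# here, not part of the original gist):
# logging.info(local_llm("What are alpacas?"))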
# tools = load_tools(["llm-math"], llm=local_llm)
# agent = initialize_agent(tools, llm=local_llm, agent="zero-shot-react-description", verbose=True)
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
llm_predictor = LLMPredictor(llm=local_llm)
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor, prompt_helper=prompt_helper)
documents = SimpleDirectoryReader('./data').load_data()
index = GPTListIndex.from_documents(
    documents, service_context=service_context)
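# index.query() answers a prompt using the files loaded from ./data; a GPTListIndex
# feeds every document chunk to the model rather than doing similarity search.
# evaluate() below uses it as the "After Documents" comparison against the bare LLMChain.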
template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
Answer:"""
from langchain import PromptTemplate, LLMChain
prompt = PromptTemplate(template=template, input_variables=["instruction"])
llm_chain = LLMChain(prompt=prompt, llm=local_llm)
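# Note (added): the chain above owns the PromptTemplate, so evaluate() below passes
# the raw instruction to llm_chain.run() and only uses the manually formatted prompt
# for logging and for the document-index query.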
def evaluate(instruction):
    # Keep the raw instruction for the chain (it applies the template itself)
    # and build the formatted prompt for logging and the index query.
    prompt_text = template.replace("{instruction}", instruction)
    # return index.query(prompt_text)
    # return agent.run(instruction)
    logging.info("___________Prompt:____________")
    logging.info(prompt_text)
    logging.info("_________Normal model____________")
    logging.info(llm_chain.run(instruction))
    logging.info("_________After Documents___________")
    logging.info(index.query(prompt_text))
    logging.info("______________________________")
evaluate("What do you think of Facebook's LLaMa?")
evaluate("How many people live in Martos?")
evaluate("What is the capital of England?")
evaluate("What are alpacas, and how are they different from llamas?")
evaluate("How much is 213769*121239?")