import os
import pandas as pd
from tqdm import tqdm
from IPython.display import display
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
from langchain.chains import LLMChain
from utils import evaluate, ColBERTv2
from langchain.agents import initialize_agent, Tool, ZeroShotAgent, ConversationalAgent, AgentExecutor
from langchain import GoogleSearchAPIWrapper
my_api_key = ""
my_cse_id = ""
my_open_ai_key = ""
os.environ["GOOGLE_CSE_ID"] = my_cse_id
os.environ["GOOGLE_API_KEY"] = my_api_key
os.environ["OPENAI_API_KEY"] = my_open_ai_key
train = [
    ('Who produced the album that included a re-recording of "Lithium"?', ['Butch Vig']),
    ('Who was the director of the 2009 movie featuring Peter Outerbridge as William Easton?', ['Kevin Greutert']),
    ('The heir to the Du Pont family fortune sponsored what wrestling team?', ['Foxcatcher', 'Team Foxcatcher', 'Foxcatcher Team']),
    ('In what year was the star of To Hell and Back born?', ['1925']),
    ('Which award did the first book of Gary Zukav receive?', ['U.S. National Book Award', 'National Book Award']),
    ('What city was the victim of Joseph Druces working in?', ['Boston, Massachusetts', 'Boston']),
]
dev = [
    ('Who has a broader scope of profession: E. L. Doctorow or Julia Peterkin?', ['E. L. Doctorow', 'E.L. Doctorow', 'Doctorow']),
    ('What documentary about the Gilgo Beach Killer debuted on A&E?', ['The Killing Season']),
    ('Right Back At It Again contains lyrics co-written by the singer born in what city?', ['Gainesville, Florida', 'Gainesville']),
    ('What year was the party of the winner of the 1971 San Francisco mayoral election founded?', ['1828']),
    ('Which author is English: John Braine or Studs Terkel?', ['John Braine']),
    ('Anthony Dirrell is the brother of which super middleweight title holder?', ['Andre Dirrell']),
    ('In which city is the sports nutrition business established by Oliver Cookson based ?', ['Cheshire', 'Cheshire, UK']),
    ('Find the birth date of the actor who played roles in First Wives Club and Searching for the Elephant.', ['February 13, 1980']),
    ('Kyle Moran was born in the town on what river?', ['Castletown', 'Castletown River']),
    ("What is the name of one branch of Robert D. Braun's speciality?", ['aeronautical engineering', 'astronautical engineering', 'aeronautics', 'astronautics']),
    ("Where was the actress who played the niece in the Priest film born?", ['Surrey', 'Guildford, Surrey']),
    ('Name the movie in which the daughter of Noel Harrison plays Violet Trefusis.', ['Portrait of a Marriage']),
    ('What year was the father of the Princes in the Tower born?', ['1442'])
]
# Initialize training and dev sets
train = [{'question': q, 'answer': a[0]} for q, a in train]
dev = [{'question': q, 'answers': a} for q, a in dev]
# Initialize LLM and retrieval model
llm = OpenAI(temperature=0)
rm = ColBERTv2('http://ec2-44-228-128-229.us-west-2.compute.amazonaws.com:8893/api/search')
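# ColBERTv2 comes from this gist's local utils module; calling rm(query, k=2) below is
# assumed to return the top-k retrieved passages as plain strings, since format_context
# later enumerates them directly.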
# Google Search Tool for Custom Agent
search = GoogleSearchAPIWrapper()
tools = [
    Tool(
        name="Search",
        func=search.run,
        description="Search the web for relevant content",
    )
]
tool_names = [tool.name for tool in tools]
# First hop
search_retrieval_template_first_hop = """
Write a search query that will help answer a complex question.
---
Follow the following format.
Question: $[the question to be answered]
Rationale: Let's think step by step. To answer this question, we first need to find out $[the missing information]
Search Query: $[a simple question for seeking the missing information]
---
Question: {question}
Rationale: Let's think step by step. To answer this question, we first need to find out"""
first_hop_prompt = PromptTemplate(
    input_variables=['question'],
    template=search_retrieval_template_first_hop,
)
first_hop_chain = LLMChain(llm=llm, prompt=first_hop_prompt)
def format_context(context):
    """
    Format and enumerate a list of context strings for use in a prompt.
    """
    return '\n'.join([f'[{i+1}] {c}' for i, c in enumerate(context)])
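# e.g. format_context(['first passage', 'second passage'])
# -> '[1] first passage\n[2] second passage'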
def extract_last_line(completion, remove_prefix=True):
    """
    Extract the last line of a completion, optionally removing the prefix.
    """
    last_line = completion.split('\n')[-1].strip()
    if remove_prefix:
        last_line = last_line.split(':')[-1].strip()
    return last_line
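# e.g. extract_last_line('Rationale: ...\nSearch Query: who produced Nevermind')
# -> 'who produced Nevermind' (the text after the last colon of the final line)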
# Followup hop
search_retrieval_template_followup_hop = """
Write a search query that will help answer a complex question.
---
Follow the following format.
Context:
$[sources that may contain relevant content]
Question: $[the question to be answered]
Rationale: Let's think step by step. Based on the context, we have learned the following. $[information from the context that provides useful clues]
Search Query: $[a simple question for seeking the missing information]
---
Context:
{context}
Question: {question}
Rationale: Let's think step by step. Based on the context, we have learned the following."""
followup_hop_prompt = PromptTemplate(
    input_variables=['context', 'question'],
    template=search_retrieval_template_followup_hop,
)
followup_hop_chain = LLMChain(llm=llm, prompt=followup_hop_prompt)
# Use Custom ZeroShotAgent
prefix = """
Answer questions with short factoid answers. Feel free to ignore irrelevant information given in the questions. You have access to the following tools:
"""
suffix = """
---
Follow the following format.
Context:
$[sources that may contain relevant content]
Question: $[the question to be answered]
Action: $[the action to take, should be one of $[Search]]
Action Input: $[the input to the action, should be a search query]
Thought: Let's think step by step. $[a step-by-step deduction that identifies the correct response, which will be provided below.]
Final Answer: $[a short factoid answer, often between 1 and 5 words]
---
Context:
{context}
Question: {question}
Thought: Let's think step by step.
{agent_scratchpad}
"""
# Renderer for each few-shot train example; a simple Question/Answer layout is assumed here
example_prompt = PromptTemplate(
    input_variables=['question', 'answer'],
    template='\nQuestion: {question}\nAnswer: {answer}\n',
)
rationale_prompt = FewShotPromptTemplate(
    examples=train,
    example_prompt=example_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=['context', 'question', 'agent_scratchpad'],
    example_separator=''
)
answer_chain = LLMChain(llm=llm, prompt=rationale_prompt)
answer_agent = ZeroShotAgent(llm_chain=answer_chain, allowed_tools=tool_names)
agent_executor = AgentExecutor.from_agent_and_tools(agent=answer_agent, tools=tools, verbose=True)
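# The executor loops: the LLM output is parsed for 'Action:' / 'Action Input:' lines, the
# matching tool (Search) is called, its observation is appended to agent_scratchpad, and
# generation continues until a 'Final Answer:' line is produced.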
# Multihop chain with agent
def run_multihop_chain(question):
    context = []
    # Get first hop retrieval question and context
    first_hop_completion = first_hop_chain.run(question=question)
    retrieval_question_first_hop = extract_last_line(first_hop_completion)
    context.extend(rm(retrieval_question_first_hop, k=2))
    # Get second hop retrieval question and context
    second_hop_completion = followup_hop_chain.run(context=format_context(context), question=question)
    retrieval_question_second_hop = extract_last_line(second_hop_completion)
    context.extend(rm(retrieval_question_second_hop, k=2))
    # Get final answer
    print(format_context(context))
    final_completion = agent_executor.run(context=format_context(context), question=question)
    answer = extract_last_line(final_completion)
    return answer
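# Example single-question run (question taken from the dev set above); this requires the
# Google CSE, OpenAI key, and ColBERTv2 endpoint configured earlier to be reachable:
# print(run_multihop_chain('What year was the father of the Princes in the Tower born?'))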
# Evaluate the multihop chain on the dev set
evaluate(run_multihop_chain, dev)
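# evaluate (also from the local utils module) is assumed to run the chain over each dev
# example and report answer accuracy against the listed gold answers.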