@shawngraham
Last active February 3, 2024 00:47
Adding OpenAlex search to GPT-Researcher (https://github.com/assafelovic/gpt-researcher/); also point the main config file to OpenAlexSearch.
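To actually select the new retriever, the main config has to name it. In the version of gpt-researcher this gist targets, the retriever appears to be read from a RETRIEVER environment variable in gpt_researcher/config/config.py; the attribute and variable names below are assumed from the upstream repo, so adjust if your copy differs. A minimal sketch:

# gpt_researcher/config/config.py -- inside Config.__init__ (sketch; names assumed from upstream)
self.retriever = os.getenv('RETRIEVER', "OpenAlexSearch")  # default was "tavily"
# alternatively, leave the default alone and set the environment variable instead:
#   RETRIEVER=OpenAlexSearch

Note that get_retriever() further below matches this string case-sensitively, so the value must be exactly "OpenAlexSearch" (or "OpenContextSearch" / "ChroniclingAmericaSearch"). The first code block is the updated gpt_researcher/retrievers/__init__.py, which registers the three new retrievers.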
from .tavily_search.tavily_search import TavilySearch
from .tavily_news.tavily_news import TavilyNews
from .duckduckgo.duckduckgo import Duckduckgo
from .google.google import GoogleSearch
from .serper.serper import SerperSearch
from .serpapi.serpapi import SerpApiSearch
from .searx.searx import SearxSearch
from .bing.bing import BingSearch
from .openalex.openalex import OpenAlexSearch
from .opencontext.opencontext import OpenContextSearch
from .chroniclingamerica.chroniclingamerica import ChroniclingAmericaSearch
__all__ = [
    "TavilySearch",
    "TavilyNews",
    "Duckduckgo",
    "SerperSearch",
    "SerpApiSearch",
    "GoogleSearch",
    "SearxSearch",
    "BingSearch",
    "OpenAlexSearch",
    "OpenContextSearch",
    "ChroniclingAmericaSearch"
]
# gpt_researcher/retrievers/chroniclingamerica/chroniclingamerica.py (path implied by the package imports above)
import requests


class ChroniclingAmericaSearch():
    """
    Chronicling America Search Retriever
    """

    def __init__(self, query):
        """
        Initializes the ChroniclingAmericaSearch object
        Args:
            query: The search query (topic of interest)
        """
        self.query = query

    def search(self, max_results=10):
        """
        Searches the Chronicling America API for articles related to the query
        and retrieves article metadata along with a link to the full text.
        Args:
            max_results: The maximum number of results to retrieve
        Returns:
            A list of dictionaries containing the 'href' (OCR text URL) and 'body' (article metadata)
        """
        print(f"Searching Chronicling America for '{self.query}'...")
        url = "https://chroniclingamerica.loc.gov/search/pages/results/"
        params = {
            'proxtext': self.query,
            'format': 'json',
            'rows': max_results
        }
        resp = requests.get(url, params=params)
        if resp.status_code != 200:
            print(f"Failed to retrieve data: HTTP {resp.status_code}")
            return []
        search_response = []
        results = resp.json().get('items', [])
        for result in results:
            # Extract the required details from the result
            content = {
                "title": result.get('title_normal', ""),
                "country": result.get('country', ""),
                "ocr_eng": result.get('ocr_eng', "")
            }
            # Swap the .json metadata URL for the plain-text OCR endpoint
            article_url = result.get('url', "No URL provided").replace('.json', '/ocr.txt')
            # Prepare the final data structure
            transformed_result = {
                "href": article_url,
                "body": content
            }
            search_response.append(transformed_result)
        return search_response
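
A quick way to smoke-test the retriever on its own (the query string is just an illustration; requires network access):

if __name__ == "__main__":
    searcher = ChroniclingAmericaSearch("influenza quarantine")
    for hit in searcher.search(max_results=3):
        print(hit["href"])
        print(hit["body"]["title"])

Because each 'href' points at the /ocr.txt endpoint, the downstream scraper pulls the full OCR text rather than the JSON metadata.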

# gpt_researcher/master/functions.py (path assumed from the upstream repo layout)
import asyncio
import json

from colorama import Fore, Style  # needed for the colored error messages below

from gpt_researcher.utils.llm import *
from gpt_researcher.scraper import Scraper
from gpt_researcher.master.prompts import *


def get_retriever(retriever):
    """
    Gets the retriever
    Args:
        retriever: retriever name
    Returns:
        retriever: Retriever class
    """
    match retriever:
        case "tavily":
            from gpt_researcher.retrievers import TavilySearch
            retriever = TavilySearch
        case "tavily_news":
            from gpt_researcher.retrievers import TavilyNews
            retriever = TavilyNews
        case "google":
            from gpt_researcher.retrievers import GoogleSearch
            retriever = GoogleSearch
        case "searx":
            from gpt_researcher.retrievers import SearxSearch
            retriever = SearxSearch
        case "serpapi":
            raise NotImplementedError("SerpApiSearch is not fully implemented yet.")
            # Unreachable until the NotImplementedError above is removed
            from gpt_researcher.retrievers import SerpApiSearch
            retriever = SerpApiSearch
        case "googleSerp":
            from gpt_researcher.retrievers import SerperSearch
            retriever = SerperSearch
        case "duckduckgo":
            from gpt_researcher.retrievers import Duckduckgo
            retriever = Duckduckgo
        case "BingSearch":
            from gpt_researcher.retrievers import BingSearch
            retriever = BingSearch
        case "OpenAlexSearch":
            from gpt_researcher.retrievers import OpenAlexSearch
            retriever = OpenAlexSearch
        case "OpenContextSearch":
            from gpt_researcher.retrievers import OpenContextSearch
            retriever = OpenContextSearch
        case "ChroniclingAmericaSearch":
            from gpt_researcher.retrievers import ChroniclingAmericaSearch
            retriever = ChroniclingAmericaSearch
        case _:
            raise Exception("Retriever not found.")
    return retriever


async def choose_agent(query, cfg):
    """
    Chooses the agent automatically
    Args:
        query: original query
        cfg: Config
    Returns:
        agent: Agent name
        agent_role_prompt: Agent role prompt
    """
    try:
        response = await create_chat_completion(
            model=cfg.smart_llm_model,
            messages=[
                {"role": "system", "content": f"{auto_agent_instructions()}"},
                {"role": "user", "content": f"task: {query}"}],
            temperature=0,
            llm_provider=cfg.llm_provider
        )
        agent_dict = json.loads(response)
        return agent_dict["server"], agent_dict["agent_role_prompt"]
    except Exception as e:
        return "Default Agent", "You are an AI critical thinker research assistant. Your sole purpose is to write well written, critically acclaimed, objective and structured reports on given text."


async def get_sub_queries(query, agent_role_prompt, cfg):
    """
    Gets the sub queries
    Args:
        query: original query
        agent_role_prompt: agent role prompt
        cfg: Config
    Returns:
        sub_queries: List of sub queries
    """
    max_research_iterations = cfg.max_iterations if cfg.max_iterations else 1
    response = await create_chat_completion(
        model=cfg.smart_llm_model,
        messages=[
            {"role": "system", "content": f"{agent_role_prompt}"},
            {"role": "user", "content": generate_search_queries_prompt(query, max_iterations=max_research_iterations)}],
        temperature=0,
        llm_provider=cfg.llm_provider
    )
    sub_queries = json.loads(response)
    return sub_queries


def scrape_urls(urls, cfg=None):
    """
    Scrapes the urls
    Args:
        urls: List of urls
        cfg: Config (optional)
    Returns:
        content: list of scraped content dictionaries
    """
    content = []
    user_agent = cfg.user_agent if cfg else "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
    try:
        content = Scraper(urls, user_agent).run()
    except Exception as e:
        print(f"{Fore.RED}Error in scrape_urls: {e}{Style.RESET_ALL}")
    return content


async def summarize(query, content, agent_role_prompt, cfg, websocket=None):
    """
    Asynchronously summarizes a list of URLs.
    Args:
        query (str): The search query.
        content (list): List of dictionaries with 'url' and 'raw_content'.
        agent_role_prompt (str): The role prompt for the agent.
        cfg (object): Configuration object.
    Returns:
        list: A list of dictionaries with 'url' and 'summary'.
    """
    # Handle each summarization task for a chunk
    async def handle_task(url, chunk):
        summary = await summarize_url(query, chunk, agent_role_prompt, cfg)
        if summary:
            await stream_output("logs", f"🌐 Summarizing url: {url}", websocket)
            await stream_output("logs", f"📃 {summary}", websocket)
        return url, summary

    # Split raw content into chunks of 10,000 words
    def chunk_content(raw_content, chunk_size=10000):
        words = raw_content.split()
        for i in range(0, len(words), chunk_size):
            yield ' '.join(words[i:i + chunk_size])

    # Process each item one by one, but process chunks in parallel
    concatenated_summaries = []
    for item in content:
        url = item['url']
        raw_content = item['raw_content']
        # Create tasks for all chunks of the current URL
        chunk_tasks = [handle_task(url, chunk) for chunk in chunk_content(raw_content)]
        # Run chunk tasks concurrently
        chunk_summaries = await asyncio.gather(*chunk_tasks)
        # Aggregate and concatenate summaries for the current URL
        summaries = [summary for _, summary in chunk_summaries if summary]
        concatenated_summary = ' '.join(summaries)
        concatenated_summaries.append({'url': url, 'summary': concatenated_summary})
    return concatenated_summaries


async def summarize_url(query, raw_data, agent_role_prompt, cfg):
    """
    Summarizes the text
    Args:
        query: the search query
        raw_data: the text to summarize
        agent_role_prompt: agent role prompt
        cfg: Config
    Returns:
        summary: str
    """
    summary = ""
    try:
        summary = await create_chat_completion(
            model=cfg.fast_llm_model,
            messages=[
                {"role": "system", "content": f"{agent_role_prompt}"},
                {"role": "user", "content": f"{generate_summary_prompt(query, raw_data)}"}],
            temperature=0,
            llm_provider=cfg.llm_provider
        )
    except Exception as e:
        print(f"{Fore.RED}Error in summarize: {e}{Style.RESET_ALL}")
    return summary


async def generate_report(query, context, agent_role_prompt, report_type, websocket, cfg):
    """
    Generates the final report
    Args:
        query: the original query
        context: the research context
        agent_role_prompt: agent role prompt
        report_type: which report prompt to use
        websocket: websocket for streaming output
        cfg: Config
    Returns:
        report: str
    """
    generate_prompt = get_report_by_type(report_type)
    report = ""
    try:
        report = await create_chat_completion(
            model=cfg.smart_llm_model,
            messages=[
                {"role": "system", "content": f"{agent_role_prompt}"},
                {"role": "user", "content": f"{generate_prompt(query, context, cfg.report_format, cfg.total_words)}"}],
            temperature=0,
            llm_provider=cfg.llm_provider,
            stream=True,
            websocket=websocket,
            max_tokens=cfg.smart_token_limit
        )
    except Exception as e:
        print(f"{Fore.RED}Error in generate_report: {e}{Style.RESET_ALL}")
    return report


async def stream_output(type, output, websocket=None, logging=True):
    """
    Streams output to the websocket
    Args:
        type: the message type (e.g. "logs")
        output: the text to stream
    Returns:
        None
    """
    if not websocket or logging:
        print(output)
    if websocket:
        await websocket.send_json({"type": type, "output": output})
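
From the caller's side, retriever resolution and search fit together roughly like this (a hedged sketch; the query string is illustrative):

retriever_class = get_retriever("OpenAlexSearch")  # must match the case arm exactly
retriever = retriever_class("roman aqueducts")
results = retriever.search(max_results=5)
urls = [r["href"] for r in results]
# scrape_urls(urls, cfg) would then fetch each href for summarization

Because the match statement compares literal strings, a value like "openalexsearch" or "OpenAlex" falls through to the "Retriever not found." exception, which is why the config value must be spelled exactly as above.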
# OpenAlex Search Retriever
# gpt_researcher/retrievers/openalex/openalex.py (path implied by the package imports above)
import requests


class OpenAlexSearch():
    """
    OpenAlex Search Retriever
    """

    def __init__(self, query):
        """
        Initializes the OpenAlexSearch object
        Args:
            query: The search query (topic of interest)
        """
        self.query = query

    def search(self, max_results=10):
        """
        Searches the OpenAlex API for works related to the query
        Args:
            max_results: The maximum number of results to retrieve
        Returns:
            A list of dictionary objects containing information about each work
        """
        print(f"Searching OpenAlex for '{self.query}'...")
        url = "https://api.openalex.org/works"
        params = {
            'filter': f'title.search:{self.query}',
            'per-page': max_results
        }
        resp = requests.get(url, params=params)
        if resp.status_code != 200:
            print(f"Failed to retrieve data: HTTP {resp.status_code}")
            return []  # return an empty list (not None) so callers can iterate safely
        try:
            search_results = resp.json()
        except Exception as e:
            print(f"Failed to parse the response: {e}")
            return []
        results = search_results.get("results", [])
        search_response = []
        for result in results:
            # Collect all keyword and concept strings
            keywords_list = [kw['keyword'] for kw in result.get('keywords', [])]
            concepts_list = [cl['display_name'] for cl in result.get('concepts', [])]
            # Assemble the content/body part
            content = {
                "title": result['display_name'],
                "year": result.get('publication_year', "No year provided"),
                "author": result['authorships'][0]['raw_author_name'] if result.get('authorships') else "No author provided",
                "concepts": concepts_list,
                "keywords": keywords_list
            }
            # Final transformation
            transformed_result = {
                "href": result.get('doi', "No DOI provided"),
                "body": content
            }
            search_response.append(transformed_result)
        return search_response
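
As with the other retrievers, a standalone check (illustrative query; each href is the DOI URL, which the scraper then resolves):

if __name__ == "__main__":
    for work in OpenAlexSearch("roman aqueducts").search(max_results=3):
        print(work["href"], "-", work["body"]["title"])

If you lean on this heavily, the OpenAlex API also accepts a mailto parameter (e.g. params['mailto'] = 'you@example.org') to route requests into its polite pool; that is an optional courtesy and not something the code above requires.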

# gpt_researcher/retrievers/opencontext/opencontext.py (path implied by the package imports above)
import requests


class OpenContextSearch():
    """
    Open Context Search Retriever
    """

    def __init__(self, query):
        """
        Initializes the OpenContextSearch object
        Args:
            query: The search query (topic of interest)
        """
        self.query = query

    def search(self, max_results=200):
        """
        Searches Open Context for records related to the query
        and retrieves record metadata.
        Returns:
            A list of dictionaries containing the 'href' (URL) and 'body' (metadata)
        """
        print(f"Searching Open Context for '{self.query}'...")
        url = "https://staging.opencontext.org/query/.json?"
        params = {'q': self.query}
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        # NOTE: max_results is accepted but not currently passed to the API
        resp = requests.get(url, params=params, headers=headers)
        print(f"here it is '{resp.url}'")  # debug: show the resolved request URL
        if resp.status_code != 200:
            print(f"Failed to retrieve data: HTTP {resp.status_code}")
            return []
        search_response = []
        results = resp.json().get('features', [])  # the GeoJSON 'features' hold the records
        for feature in results:
            content = {
                # "id": feature.get('id', ""),
                "label": feature.get('label', ""),
                "when-start": feature.get('when', {}).get('start', ""),
                "when-stop": feature.get('when', {}).get('stop', ""),
                "project": feature.get('properties', {}).get('project label', ""),
                "context-label": feature.get('properties', {}).get('context label', ""),
            }
            article_url = feature.get('properties', {}).get('uri', "No URL provided")
            if article_url != "No URL provided":
                transformed_result = {
                    "href": article_url + ".json",
                    "body": content
                }
                search_response.append(transformed_result)
        print(search_response)  # debug: dump the assembled results
        return search_response
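
Note that this retriever queries the staging server (staging.opencontext.org); swapping in https://opencontext.org/query/.json should presumably work against production, though that is untested here. Since max_results is never sent to the API, the result count is whatever the endpoint's default page size returns. A standalone check (illustrative query):

if __name__ == "__main__":
    for rec in OpenContextSearch("kiln").search():
        print(rec["href"], rec["body"]["label"])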

# gpt_researcher/master/prompts.py (path assumed from the import in functions.py above)
from datetime import datetime


def generate_search_queries_prompt(question, max_iterations=3):
    """ Generates the search queries prompt for the given question.
    Args: question (str): The question to generate the search queries prompt for
    Returns: str: The search queries prompt for the given question
    """
    return f'Write {max_iterations} google search queries to search online that form an objective opinion from the following: "{question}"\n' \
           f'The FIRST QUERY MUST capture just the main NOUN or VERBAL PHRASE.\n' \
           f'Use the current date if needed: {datetime.now().strftime("%B %d, %Y")}.\n' \
           f'You must respond with a list of strings in the following format: ["query 1", "query 2", "query 3"].\n'


def generate_report_prompt(question, context, report_format="apa", total_words=1000):
    """ Generates the report prompt for the given question and research summary.
    Args: question (str): The question to generate the report prompt for
          context (str): The research summary to generate the report prompt for
    Returns: str: The report prompt for the given question and research summary
    """
    return f'Information: """{context}"""\n\n' \
           f'IF there is any information above, answer the following' \
           f' query or task: "{question}" in a detailed report. Otherwise write "Not enough data, refine your questions." --' \
           " The report should focus on the answer to the query, should be well structured, informative," \
           f" in depth and comprehensive, with facts and numbers if available and a minimum of {total_words} words.\n" \
           "You should strive to write the report as long as you can using all relevant and necessary information provided.\n" \
           "You must write the report with markdown syntax.\n" \
           "Use an unbiased and journalistic tone.\n" \
           "You MUST determine your own concrete and valid opinion based on the given information. Do NOT defer to general and meaningless conclusions.\n" \
           "You MUST write all used source urls at the end of the report as references, and make sure to not add duplicated sources, but only one reference for each.\n" \
           f"You MUST write the report in {report_format} format.\n" \
           "YOU MUST CITE search results using inline notations. Only cite the most " \
           "relevant results that answer the query accurately. YOU MUST place these citations at the end " \
           "of the sentence or paragraph that references them.\n" \
           "Let's think this through step by step. Please do your best, this is very important to my career. " \
           f"Assume that the current date is {datetime.now().strftime('%B %d, %Y')}"


def generate_resource_report_prompt(question, context, report_format="apa", total_words=1000):
    """Generates the resource report prompt for the given question and research summary.
    Args:
        question (str): The question to generate the resource report prompt for.
        context (str): The research summary to generate the resource report prompt for.
    Returns:
        str: The resource report prompt for the given question and research summary.
    """
    return f'"""{context}"""\n\nIf there is any information above, use the above information to generate a bibliography recommendation report for the following' \
           f' question or topic: "{question}". Otherwise say "Not enough data, refine your query". The report should provide a detailed analysis of each recommended resource,' \
           ' explaining how each source can contribute to finding answers to the research question.\n' \
           'Focus on the relevance, reliability, and significance of each source.\n' \
           'Ensure that the report is well-structured, informative, in-depth, and follows Markdown syntax.\n' \
           'Include relevant facts, figures, and numbers whenever available.\n' \
           'The report should have a minimum length of 700 words.\n' \
           'You MUST include all relevant source urls.\n' \
           'Think this through step by step. Please do your best, this is very important to my career.'


def generate_custom_report_prompt(query_prompt, context, report_format="apa", total_words=1000):
    return f'"{context}"\n\n{query_prompt}'


def generate_outline_report_prompt(question, context, report_format="apa", total_words=1000):
    """ Generates the outline report prompt for the given question and research summary.
    Args: question (str): The question to generate the outline report prompt for
          context (str): The research summary to generate the outline report prompt for
    Returns: str: The outline report prompt for the given question and research summary
    """
    return f'"""{context}""" If there is any information above, use it to generate an outline for a research report in Markdown syntax' \
           f' for the following question or topic: "{question}". Otherwise say "Not enough data, refine your query". The outline should provide a well-structured framework' \
           ' for the research report, including the main sections, subsections, and key points to be covered.' \
           ' The research report should be detailed, informative, in-depth, and a minimum of 1,200 words.' \
           ' Use appropriate Markdown syntax to format the outline and ensure readability.' \
           ' Indicate with a citation the academic resource to best support a particular section or subsection or key point.' \
           ' Think this through step by step. Please do your best, this is very important to my career.'


def generate_critical_flaws_prompt(question, context, report_format="apa", total_words=2000):
    """ Generates a critical-flaws bibliography report prompt, surfacing the OPPOSING view for the given question and research summary.
    Args: question (str): The question to generate the report prompt for
          context (str): The research summary to generate the report prompt for
    Returns: str: The critical-flaws report prompt for the given question and research summary
    """
    return f'"""{context}"""\n\nIf there is any information above, use it to generate a bibliography recommendation report for the following' \
           f' question or topic: "{question}". Otherwise say "Not enough data, refine your query". The report should provide a detailed analysis of each recommended resource,' \
           ' explaining how each source can contribute to finding answers to the research question.\n' \
           'Focus on the relevance, reliability, and significance of each source.\n' \
           'EXPLICITLY DISCUSS GAPS, FLAWS, LOGICAL ERRORS or implied conflict with another resource you mention that might be present. Pair resources that might be in opposition to each other.\n' \
           'Ensure that the report is well-structured, informative, in-depth, and follows Markdown syntax.\n' \
           'Include relevant facts, figures, and numbers whenever available.\n' \
           'You MUST include all relevant source urls.\n' \
           'Think this through step by step. Please do your best, this is very important to my career.'


def generate_kg_prompt(question, context, report_format="apa", total_words=1000):
    return f'"""{context}"""\n\nIf there is any information above, use it to generate appropriate knowledge graph triples for the following' \
           f' question or topic: "{question}". Otherwise say "not enough data, refine your query". The triples should accurately represent the key concepts, entities, and relationships' \
           ' encapsulated within the research question or topic. Return triples like so: [subject],[predicate],[object]\n' \
           'Each triple should consist of a subject, predicate, and object, clearly defining how entities are interconnected.\n' \
           'Focus on the precision, relevancy, and clarity of each triple.\n' \
           'IDENTIFY AND HIGHLIGHT any potential ambiguities or uncertainties that might impact the interpretation or integrity of the knowledge graph.\n' \
           'Ensure that the triples are well-structured, ontologically consistent, and can be effectively utilized for constructing a coherent knowledge graph.\n' \
           'Include clear definitions for each entity and relationship wherever necessary.\n' \
           'Return triples under a separate heading at the end of the report. The set of triples should cover all relevant aspects of the research question or topic, providing a solid foundation for further analysis.\n' \
           'You MUST ensure that each triple is valid, actionable, and contributes to an accurate representation of the subject matter.\n' \
           'Strategically think through the relationships and entities involved. Please do your best, as this is critical for constructing an informative knowledge graph.'


def generate_old_newspapers_prompt(question, context, report_format="apa", total_words=5000):
    return f'"""{context}"""\n\nIf there is any information above, use it to consider the following' \
           f' question or topic: "{question}". Otherwise say "not enough data, refine your query". The information represents badly OCR\'d text' \
           ' and should be treated cautiously.\n' \
           'Pick out the byline dates and try to create a short summary.\n' \
           'ORGANIZE by earliest date first and write the date and the summary.\n' \
           'Then create a synoptic view of the main ideas or issues and how they change over time.\n' \
           'Be cautious and cite your sources thoroughly by reference to the original newspaper article.\n' \
           'Please do your best, as this is critical for constructing an informative data set.'


def generate_archaeology_prompt(question, context, report_format="apa", total_words=5000):
    return f'"""{context}"""\n\nIf there is any information above, use it to consider the following' \
           f' question or topic: "{question}". Otherwise say "not enough data, refine your query".\n' \
           'Summarize the available metadata.\n' \
           'Categorize the chronological or spatial extent for each category of artefact.\n' \
           'Summarize any caveats noted by the investigators.\n' \
           'Then write a general synopsis. DO NOT suggest citations or further reading known from your training data.\n' \
           'Please do your best, as this is important for my career.'


def get_report_by_type(report_type):
    report_type_mapping = {
        'research_report': generate_report_prompt,
        'resource_report': generate_resource_report_prompt,
        'outline_report': generate_outline_report_prompt,
        'critical_flaws': generate_critical_flaws_prompt,
        'knowledge_graph': generate_kg_prompt,
        'old_newspapers': generate_old_newspapers_prompt,
        'archaeology': generate_archaeology_prompt
    }
    return report_type_mapping[report_type]


def auto_agent_instructions():
    return """
        This task involves researching a given topic, regardless of its complexity or the availability of a definitive answer. The research is conducted by a specific server, defined by its type and role, with each server requiring distinct instructions.
        Agent
        The server is determined by the field of the topic and the specific name of the server that could be utilized to research the topic provided. Agents are categorized by their area of expertise, and each server type is associated with a corresponding emoji.

        examples:
        task: "should I invest in apple stocks?"
        response:
        {
            "server": "💰 Finance Agent",
            "agent_role_prompt": "You are a seasoned finance analyst AI assistant. Your primary goal is to compose comprehensive, astute, impartial, and methodically arranged financial reports based on provided data and trends."
        }
        task: "could reselling sneakers become profitable?"
        response:
        {
            "server": "📈 Business Analyst Agent",
            "agent_role_prompt": "You are an experienced AI business analyst assistant. Your main objective is to produce comprehensive, insightful, impartial, and systematically structured business reports based on provided business data, market trends, and strategic analysis."
        }
        task: "what are the most interesting sites in Tel Aviv?"
        response:
        {
            "server": "🌍 Travel Agent",
            "agent_role_prompt": "You are a world-travelled AI tour guide assistant. Your main purpose is to draft engaging, insightful, unbiased, and well-structured travel reports on given locations, including history, attractions, and cultural insights."
        }
        task: "How did the events of June 23 in Ottawa Ontario impact debates in the House?"
        response:
        {
            "server": "📚 History Analyst Agent",
            "agent_role_prompt": "You are a renowned historian. Your main task is to analyze materials to deduce connections, causes, or influences, writing engaging, insightful, unbiased, and truthful reports from the materials at hand."
        }
    """


def generate_summary_prompt(query, data):
    """ Generates the summary prompt for the given question and text.
    Args: query (str): The question to generate the summary prompt for
          data (str): The text to generate the summary prompt for
    Returns: str: The summary prompt for the given question and text
    """
    return f'{data}\n Using the above text, summarize it based on the following task or query: "{query}".\n If the ' \
           f'query cannot be answered using the text, YOU MUST summarize the text in short.\n Include all factual ' \
           f'information such as numbers, stats, quotes, etc if available. '
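
For reference, the new report types plug into the existing dispatch; a hedged usage sketch with an illustrative question and placeholder context:

prompt_fn = get_report_by_type("old_newspapers")
prompt = prompt_fn(
    question="How was the 1918 influenza reported in local papers?",
    context="...OCR text gathered by ChroniclingAmericaSearch...",
)
# prompt is then sent as the user message in generate_report()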