@shawngraham
Last active February 3, 2024 00:47
Adding OpenAlex search to GPT-Researcher (https://github.com/assafelovic/gpt-researcher/); also point the main config file to OpenAlexSearch.
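To actually select the new retriever, the main config has to name it. In the version of gpt-researcher this gist targets, the retriever appears to be read from a RETRIEVER environment variable in gpt_researcher/config/config.py; the attribute and variable names below are assumed from the upstream repo, so adjust if your copy differs. A minimal sketch:

# gpt_researcher/config/config.py -- inside Config.__init__ (sketch; names assumed from upstream)
self.retriever = os.getenv('RETRIEVER', "OpenAlexSearch")  # default was "tavily"
# alternatively, leave the default alone and set the environment variable instead:
#   RETRIEVER=OpenAlexSearch

Note that get_retriever() further below matches this string case-sensitively, so the value must be exactly "OpenAlexSearch" (or "OpenContextSearch" / "ChroniclingAmericaSearch"). The first code block is the updated gpt_researcher/retrievers/__init__.py, which registers the three new retrievers.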
from .tavily_search.tavily_search import TavilySearch
from .tavily_news.tavily_news import TavilyNews
from .duckduckgo.duckduckgo import Duckduckgo
from .google.google import GoogleSearch
from .serper.serper import SerperSearch
from .serpapi.serpapi import SerpApiSearch
from .searx.searx import SearxSearch
from .bing.bing import BingSearch
from .openalex.openalex import OpenAlexSearch
from .opencontext.opencontext import OpenContextSearch
from .chroniclingamerica.chroniclingamerica import ChroniclingAmericaSearch
__all__ = [
    "TavilySearch",
    "TavilyNews",
    "Duckduckgo",
    "SerperSearch",
    "SerpApiSearch",
    "GoogleSearch",
    "SearxSearch",
    "BingSearch",
    "OpenAlexSearch",
    "OpenContextSearch",
    "ChroniclingAmericaSearch"
]
# gpt_researcher/retrievers/chroniclingamerica/chroniclingamerica.py (path implied by the package imports above)
import requests


class ChroniclingAmericaSearch():
    """
    Chronicling America Search Retriever
    """

    def __init__(self, query):
        """
        Initializes the ChroniclingAmericaSearch object
        Args:
            query: The search query (topic of interest)
        """
        self.query = query

    def search(self, max_results=10):
        """
        Searches the Chronicling America API for articles related to the query
        and retrieves article metadata along with a link to the full text.
        Args:
            max_results: The maximum number of results to retrieve
        Returns:
            A list of dictionaries containing the 'href' (OCR text URL) and 'body' (article metadata)
        """
        print(f"Searching Chronicling America for '{self.query}'...")
        url = "https://chroniclingamerica.loc.gov/search/pages/results/"
        params = {
            'proxtext': self.query,
            'format': 'json',
            'rows': max_results
        }
        resp = requests.get(url, params=params)
        if resp.status_code != 200:
            print(f"Failed to retrieve data: HTTP {resp.status_code}")
            return []
        search_response = []
        results = resp.json().get('items', [])
        for result in results:
            # Extract the required details from the result
            content = {
                "title": result.get('title_normal', ""),
                "country": result.get('country', ""),
                "ocr_eng": result.get('ocr_eng', "")
            }
            # Swap the .json metadata URL for the plain-text OCR endpoint
            article_url = result.get('url', "No URL provided").replace('.json', '/ocr.txt')
            # Prepare the final data structure
            transformed_result = {
                "href": article_url,
                "body": content
            }
            search_response.append(transformed_result)
        return search_response
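
A quick way to smoke-test the retriever on its own (the query string is just an illustration; requires network access):

if __name__ == "__main__":
    searcher = ChroniclingAmericaSearch("influenza quarantine")
    for hit in searcher.search(max_results=3):
        print(hit["href"])
        print(hit["body"]["title"])

Because each 'href' points at the /ocr.txt endpoint, the downstream scraper pulls the full OCR text rather than the JSON metadata.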

# gpt_researcher/master/functions.py (path assumed from the upstream repo layout)
import asyncio
import json

from colorama import Fore, Style  # needed for the colored error messages below

from gpt_researcher.utils.llm import *
from gpt_researcher.scraper import Scraper
from gpt_researcher.master.prompts import *


def get_retriever(retriever):
    """
    Gets the retriever
    Args:
        retriever: retriever name
    Returns:
        retriever: Retriever class
    """
    match retriever:
        case "tavily":
            from gpt_researcher.retrievers import TavilySearch
            retriever = TavilySearch
        case "tavily_news":
            from gpt_researcher.retrievers import TavilyNews
            retriever = TavilyNews
        case "google":
            from gpt_researcher.retrievers import GoogleSearch
            retriever = GoogleSearch
        case "searx":
            from gpt_researcher.retrievers import SearxSearch
            retriever = SearxSearch
        case "serpapi":
            raise NotImplementedError("SerpApiSearch is not fully implemented yet.")
            # Unreachable until the NotImplementedError above is removed
            from gpt_researcher.retrievers import SerpApiSearch
            retriever = SerpApiSearch
        case "googleSerp":
            from gpt_researcher.retrievers import SerperSearch
            retriever = SerperSearch
        case "duckduckgo":
            from gpt_researcher.retrievers import Duckduckgo
            retriever = Duckduckgo
        case "BingSearch":
            from gpt_researcher.retrievers import BingSearch
            retriever = BingSearch
        case "OpenAlexSearch":
            from gpt_researcher.retrievers import OpenAlexSearch
            retriever = OpenAlexSearch
        case "OpenContextSearch":
            from gpt_researcher.retrievers import OpenContextSearch
            retriever = OpenContextSearch
        case "ChroniclingAmericaSearch":
            from gpt_researcher.retrievers import ChroniclingAmericaSearch
            retriever = ChroniclingAmericaSearch
        case _:
            raise Exception("Retriever not found.")
    return retriever


async def choose_agent(query, cfg):
    """
    Chooses the agent automatically
    Args:
        query: original query
        cfg: Config
    Returns:
        agent: Agent name
        agent_role_prompt: Agent role prompt
    """
    try:
        response = await create_chat_completion(
            model=cfg.smart_llm_model,
            messages=[
                {"role": "system", "content": f"{auto_agent_instructions()}"},
                {"role": "user", "content": f"task: {query}"}],
            temperature=0,
            llm_provider=cfg.llm_provider
        )
        agent_dict = json.loads(response)
        return agent_dict["server"], agent_dict["agent_role_prompt"]
    except Exception as e:
        return "Default Agent", "You are an AI critical thinker research assistant. Your sole purpose is to write well written, critically acclaimed, objective and structured reports on given text."


async def get_sub_queries(query, agent_role_prompt, cfg):
    """
    Gets the sub queries
    Args:
        query: original query
        agent_role_prompt: agent role prompt
        cfg: Config
    Returns:
        sub_queries: List of sub queries
    """
    max_research_iterations = cfg.max_iterations if cfg.max_iterations else 1
    response = await create_chat_completion(
        model=cfg.smart_llm_model,
        messages=[
            {"role": "system", "content": f"{agent_role_prompt}"},
            {"role": "user", "content": generate_search_queries_prompt(query, max_iterations=max_research_iterations)}],
        temperature=0,
        llm_provider=cfg.llm_provider
    )
    sub_queries = json.loads(response)
    return sub_queries


def scrape_urls(urls, cfg=None):
    """
    Scrapes the urls
    Args:
        urls: List of urls
        cfg: Config (optional)
    Returns:
        content: list of scraped content dictionaries
    """
    content = []
    user_agent = cfg.user_agent if cfg else "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
    try:
        content = Scraper(urls, user_agent).run()
    except Exception as e:
        print(f"{Fore.RED}Error in scrape_urls: {e}{Style.RESET_ALL}")
    return content


async def summarize(query, content, agent_role_prompt, cfg, websocket=None):
    """
    Asynchronously summarizes a list of URLs.
    Args:
        query (str): The search query.
        content (list): List of dictionaries with 'url' and 'raw_content'.
        agent_role_prompt (str): The role prompt for the agent.
        cfg (object): Configuration object.
    Returns:
        list: A list of dictionaries with 'url' and 'summary'.
    """
    # Handle each summarization task for a chunk
    async def handle_task(url, chunk):
        summary = await summarize_url(query, chunk, agent_role_prompt, cfg)
        if summary:
            await stream_output("logs", f"🌐 Summarizing url: {url}", websocket)
            await stream_output("logs", f"📃 {summary}", websocket)
        return url, summary

    # Split raw content into chunks of 10,000 words
    def chunk_content(raw_content, chunk_size=10000):
        words = raw_content.split()
        for i in range(0, len(words), chunk_size):
            yield ' '.join(words[i:i + chunk_size])

    # Process each item one by one, but process chunks in parallel
    concatenated_summaries = []
    for item in content:
        url = item['url']
        raw_content = item['raw_content']
        # Create tasks for all chunks of the current URL
        chunk_tasks = [handle_task(url, chunk) for chunk in chunk_content(raw_content)]
        # Run chunk tasks concurrently
        chunk_summaries = await asyncio.gather(*chunk_tasks)
        # Aggregate and concatenate summaries for the current URL
        summaries = [summary for _, summary in chunk_summaries if summary]
        concatenated_summary = ' '.join(summaries)
        concatenated_summaries.append({'url': url, 'summary': concatenated_summary})
    return concatenated_summaries


async def summarize_url(query, raw_data, agent_role_prompt, cfg):
    """
    Summarizes the text
    Args:
        query: the search query
        raw_data: the text to summarize
        agent_role_prompt: agent role prompt
        cfg: Config
    Returns:
        summary: str
    """
    summary = ""
    try:
        summary = await create_chat_completion(
            model=cfg.fast_llm_model,
            messages=[
                {"role": "system", "content": f"{agent_role_prompt}"},
                {"role": "user", "content": f"{generate_summary_prompt(query, raw_data)}"}],
            temperature=0,
            llm_provider=cfg.llm_provider
        )
    except Exception as e:
        print(f"{Fore.RED}Error in summarize: {e}{Style.RESET_ALL}")
    return summary


async def generate_report(query, context, agent_role_prompt, report_type, websocket, cfg):
    """
    Generates the final report
    Args:
        query: the original query
        context: the research context
        agent_role_prompt: agent role prompt
        report_type: which report prompt to use
        websocket: websocket for streaming output
        cfg: Config
    Returns:
        report: str
    """
    generate_prompt = get_report_by_type(report_type)
    report = ""
    try:
        report = await create_chat_completion(
            model=cfg.smart_llm_model,
            messages=[
                {"role": "system", "content": f"{agent_role_prompt}"},
                {"role": "user", "content": f"{generate_prompt(query, context, cfg.report_format, cfg.total_words)}"}],
            temperature=0,
            llm_provider=cfg.llm_provider,
            stream=True,
            websocket=websocket,
            max_tokens=cfg.smart_token_limit
        )
    except Exception as e:
        print(f"{Fore.RED}Error in generate_report: {e}{Style.RESET_ALL}")
    return report


async def stream_output(type, output, websocket=None, logging=True):
    """
    Streams output to the websocket
    Args:
        type: the message type (e.g. "logs")
        output: the text to stream
    Returns:
        None
    """
    if not websocket or logging:
        print(output)
    if websocket:
        await websocket.send_json({"type": type, "output": output})
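
From the caller's side, retriever resolution and search fit together roughly like this (a hedged sketch; the query string is illustrative):

retriever_class = get_retriever("OpenAlexSearch")  # must match the case arm exactly
retriever = retriever_class("roman aqueducts")
results = retriever.search(max_results=5)
urls = [r["href"] for r in results]
# scrape_urls(urls, cfg) would then fetch each href for summarization

Because the match statement compares literal strings, a value like "openalexsearch" or "OpenAlex" falls through to the "Retriever not found." exception, which is why the config value must be spelled exactly as above.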
# OpenAlex Search Retriever
# gpt_researcher/retrievers/openalex/openalex.py (path implied by the package imports above)
import requests


class OpenAlexSearch():
    """
    OpenAlex Search Retriever
    """

    def __init__(self, query):
        """
        Initializes the OpenAlexSearch object
        Args:
            query: The search query (topic of interest)
        """
        self.query = query

    def search(self, max_results=10):
        """
        Searches the OpenAlex API for works related to the query
        Args:
            max_results: The maximum number of results to retrieve
        Returns:
            A list of dictionary objects containing information about each work
        """
        print(f"Searching OpenAlex for '{self.query}'...")
        url = "https://api.openalex.org/works"
        params = {
            'filter': f'title.search:{self.query}',
            'per-page': max_results
        }
        resp = requests.get(url, params=params)
        if resp.status_code != 200:
            print(f"Failed to retrieve data: HTTP {resp.status_code}")
            return []  # return an empty list (not None) so callers can iterate safely
        try:
            search_results = resp.json()
        except Exception as e:
            print(f"Failed to parse the response: {e}")
            return []
        results = search_results.get("results", [])
        search_response = []
        for result in results:
            # Collect all keyword and concept strings
            keywords_list = [kw['keyword'] for kw in result.get('keywords', [])]
            concepts_list = [cl['display_name'] for cl in result.get('concepts', [])]
            # Assemble the content/body part
            content = {
                "title": result['display_name'],
                "year": result.get('publication_year', "No year provided"),
                "author": result['authorships'][0]['raw_author_name'] if result.get('authorships') else "No author provided",
                "concepts": concepts_list,
                "keywords": keywords_list
            }
            # Final transformation
            transformed_result = {
                "href": result.get('doi', "No DOI provided"),
                "body": content
            }
            search_response.append(transformed_result)
        return search_response
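
As with the other retrievers, a standalone check (illustrative query; each href is the DOI URL, which the scraper then resolves):

if __name__ == "__main__":
    for work in OpenAlexSearch("roman aqueducts").search(max_results=3):
        print(work["href"], "-", work["body"]["title"])

If you lean on this heavily, the OpenAlex API also accepts a mailto parameter (e.g. params['mailto'] = 'you@example.org') to route requests into its polite pool; that is an optional courtesy and not something the code above requires.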

# gpt_researcher/retrievers/opencontext/opencontext.py (path implied by the package imports above)
import requests


class OpenContextSearch():
    """
    Open Context Search Retriever
    """

    def __init__(self, query):
        """
        Initializes the OpenContextSearch object
        Args:
            query: The search query (topic of interest)
        """
        self.query = query

    def search(self, max_results=200):
        """
        Searches Open Context for records related to the query
        and retrieves record metadata.
        Returns:
            A list of dictionaries containing the 'href' (URL) and 'body' (metadata)
        """
        print(f"Searching Open Context for '{self.query}'...")
        url = "https://staging.opencontext.org/query/.json?"
        params = {'q': self.query}
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        # NOTE: max_results is accepted but not currently passed to the API
        resp = requests.get(url, params=params, headers=headers)
        print(f"here it is '{resp.url}'")  # debug: show the resolved request URL
        if resp.status_code != 200:
            print(f"Failed to retrieve data: HTTP {resp.status_code}")
            return []
        search_response = []
        results = resp.json().get('features', [])  # the GeoJSON 'features' hold the records
        for feature in results:
            content = {
                # "id": feature.get('id', ""),
                "label": feature.get('label', ""),
                "when-start": feature.get('when', {}).get('start', ""),
                "when-stop": feature.get('when', {}).get('stop', ""),
                "project": feature.get('properties', {}).get('project label', ""),
                "context-label": feature.get('properties', {}).get('context label', ""),
            }
            article_url = feature.get('properties', {}).get('uri', "No URL provided")
            if article_url != "No URL provided":
                transformed_result = {
                    "href": article_url + ".json",
                    "body": content
                }
                search_response.append(transformed_result)
        print(search_response)  # debug: dump the assembled results
        return search_response
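
Note that this retriever queries the staging server (staging.opencontext.org); swapping in https://opencontext.org/query/.json should presumably work against production, though that is untested here. Since max_results is never sent to the API, the result count is whatever the endpoint's default page size returns. A standalone check (illustrative query):

if __name__ == "__main__":
    for rec in OpenContextSearch("kiln").search():
        print(rec["href"], rec["body"]["label"])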

# gpt_researcher/master/prompts.py (path assumed from the import in functions.py above)
from datetime import datetime


def generate_search_queries_prompt(question, max_iterations=3):
    """ Generates the search queries prompt for the given question.
    Args: question (str): The question to generate the search queries prompt for
    Returns: str: The search queries prompt for the given question
    """
    return f'Write {max_iterations} google search queries to search online that form an objective opinion from the following: "{question}"\n' \
           f'The FIRST QUERY MUST capture just the main NOUN or VERBAL PHRASE.\n' \
           f'Use the current date if needed: {datetime.now().strftime("%B %d, %Y")}.\n' \
           f'You must respond with a list of strings in the following format: ["query 1", "query 2", "query 3"].\n'


def generate_report_prompt(question, context, report_format="apa", total_words=1000):
    """ Generates the report prompt for the given question and research summary.
    Args: question (str): The question to generate the report prompt for
          context (str): The research summary to generate the report prompt for
    Returns: str: The report prompt for the given question and research summary
    """
    return f'Information: """{context}"""\n\n' \
           f'IF there is any information above, answer the following' \
           f' query or task: "{question}" in a detailed report. Otherwise write "Not enough data, refine your questions." --' \
           " The report should focus on the answer to the query, should be well structured, informative," \
           f" in depth and comprehensive, with facts and numbers if available and a minimum of {total_words} words.\n" \
           "You should strive to write the report as long as you can using all relevant and necessary information provided.\n" \
           "You must write the report with markdown syntax.\n" \
           "Use an unbiased and journalistic tone.\n" \
           "You MUST determine your own concrete and valid opinion based on the given information. Do NOT defer to general and meaningless conclusions.\n" \
           "You MUST write all used source urls at the end of the report as references, and make sure to not add duplicated sources, but only one reference for each.\n" \
           f"You MUST write the report in {report_format} format.\n" \
           "YOU MUST CITE search results using inline notations. Only cite the most " \
           "relevant results that answer the query accurately. YOU MUST place these citations at the end " \
           "of the sentence or paragraph that references them.\n" \
           "Let's think this through step by step. Please do your best, this is very important to my career. " \
           f"Assume that the current date is {datetime.now().strftime('%B %d, %Y')}"


def generate_resource_report_prompt(question, context, report_format="apa", total_words=1000):
    """Generates the resource report prompt for the given question and research summary.
    Args:
        question (str): The question to generate the resource report prompt for.
        context (str): The research summary to generate the resource report prompt for.
    Returns:
        str: The resource report prompt for the given question and research summary.
    """
    return f'"""{context}"""\n\nIf there is any information above, use the above information to generate a bibliography recommendation report for the following' \
           f' question or topic: "{question}". Otherwise say "Not enough data, refine your query". The report should provide a detailed analysis of each recommended resource,' \
           ' explaining how each source can contribute to finding answers to the research question.\n' \
           'Focus on the relevance, reliability, and significance of each source.\n' \
           'Ensure that the report is well-structured, informative, in-depth, and follows Markdown syntax.\n' \
           'Include relevant facts, figures, and numbers whenever available.\n' \
           'The report should have a minimum length of 700 words.\n' \
           'You MUST include all relevant source urls.\n' \
           'Think this through step by step. Please do your best, this is very important to my career.'


def generate_custom_report_prompt(query_prompt, context, report_format="apa", total_words=1000):
    return f'"{context}"\n\n{query_prompt}'


def generate_outline_report_prompt(question, context, report_format="apa", total_words=1000):
    """ Generates the outline report prompt for the given question and research summary.
    Args: question (str): The question to generate the outline report prompt for
          context (str): The research summary to generate the outline report prompt for
    Returns: str: The outline report prompt for the given question and research summary
    """
    return f'"""{context}""" If there is any information above, use it to generate an outline for a research report in Markdown syntax' \
           f' for the following question or topic: "{question}". Otherwise say "Not enough data, refine your query". The outline should provide a well-structured framework' \
           ' for the research report, including the main sections, subsections, and key points to be covered.' \
           ' The research report should be detailed, informative, in-depth, and a minimum of 1,200 words.' \
           ' Use appropriate Markdown syntax to format the outline and ensure readability.' \
           ' Indicate with a citation the academic resource to best support a particular section or subsection or key point.' \
           ' Think this through step by step. Please do your best, this is very important to my career.'


def generate_critical_flaws_prompt(question, context, report_format="apa", total_words=2000):
    """ Generates a critical-flaws bibliography report prompt, surfacing the OPPOSING view for the given question and research summary.
    Args: question (str): The question to generate the report prompt for
          context (str): The research summary to generate the report prompt for
    Returns: str: The critical-flaws report prompt for the given question and research summary
    """
    return f'"""{context}"""\n\nIf there is any information above, use it to generate a bibliography recommendation report for the following' \
           f' question or topic: "{question}". Otherwise say "Not enough data, refine your query". The report should provide a detailed analysis of each recommended resource,' \
           ' explaining how each source can contribute to finding answers to the research question.\n' \
           'Focus on the relevance, reliability, and significance of each source.\n' \
           'EXPLICITLY DISCUSS GAPS, FLAWS, LOGICAL ERRORS or implied conflict with another resource you mention that might be present. Pair resources that might be in opposition to each other.\n' \
           'Ensure that the report is well-structured, informative, in-depth, and follows Markdown syntax.\n' \
           'Include relevant facts, figures, and numbers whenever available.\n' \
           'You MUST include all relevant source urls.\n' \
           'Think this through step by step. Please do your best, this is very important to my career.'


def generate_kg_prompt(question, context, report_format="apa", total_words=1000):
    return f'"""{context}"""\n\nIf there is any information above, use it to generate appropriate knowledge graph triples for the following' \
           f' question or topic: "{question}". Otherwise say "not enough data, refine your query". The triples should accurately represent the key concepts, entities, and relationships' \
           ' encapsulated within the research question or topic. Return triples like so: [subject],[predicate],[object]\n' \
           'Each triple should consist of a subject, predicate, and object, clearly defining how entities are interconnected.\n' \
           'Focus on the precision, relevancy, and clarity of each triple.\n' \
           'IDENTIFY AND HIGHLIGHT any potential ambiguities or uncertainties that might impact the interpretation or integrity of the knowledge graph.\n' \
           'Ensure that the triples are well-structured, ontologically consistent, and can be effectively utilized for constructing a coherent knowledge graph.\n' \
           'Include clear definitions for each entity and relationship wherever necessary.\n' \
           'Return triples under a separate heading at the end of the report. The set of triples should cover all relevant aspects of the research question or topic, providing a solid foundation for further analysis.\n' \
           'You MUST ensure that each triple is valid, actionable, and contributes to an accurate representation of the subject matter.\n' \
           'Strategically think through the relationships and entities involved. Please do your best, as this is critical for constructing an informative knowledge graph.'


def generate_old_newspapers_prompt(question, context, report_format="apa", total_words=5000):
    return f'"""{context}"""\n\nIf there is any information above, use it to consider the following' \
           f' question or topic: "{question}". Otherwise say "not enough data, refine your query". The information represents badly OCR\'d text' \
           ' and should be treated cautiously.\n' \
           'Pick out the byline dates and try to create a short summary.\n' \
           'ORGANIZE by earliest date first and write the date and the summary.\n' \
           'Then create a synoptic view of the main ideas or issues and how they change over time.\n' \
           'Be cautious and cite your sources thoroughly by reference to the original newspaper article.\n' \
           'Please do your best, as this is critical for constructing an informative data set.'


def generate_archaeology_prompt(question, context, report_format="apa", total_words=5000):
    return f'"""{context}"""\n\nIf there is any information above, use it to consider the following' \
           f' question or topic: "{question}". Otherwise say "not enough data, refine your query".\n' \
           'Summarize the available metadata.\n' \
           'Categorize the chronological or spatial extent for each category of artefact.\n' \
           'Summarize any caveats noted by the investigators.\n' \
           'Then write a general synopsis. DO NOT suggest citations or further reading known from your training data.\n' \
           'Please do your best, as this is important for my career.'


def get_report_by_type(report_type):
    report_type_mapping = {
        'research_report': generate_report_prompt,
        'resource_report': generate_resource_report_prompt,
        'outline_report': generate_outline_report_prompt,
        'critical_flaws': generate_critical_flaws_prompt,
        'knowledge_graph': generate_kg_prompt,
        'old_newspapers': generate_old_newspapers_prompt,
        'archaeology': generate_archaeology_prompt
    }
    return report_type_mapping[report_type]


def auto_agent_instructions():
    return """
        This task involves researching a given topic, regardless of its complexity or the availability of a definitive answer. The research is conducted by a specific server, defined by its type and role, with each server requiring distinct instructions.
        Agent
        The server is determined by the field of the topic and the specific name of the server that could be utilized to research the topic provided. Agents are categorized by their area of expertise, and each server type is associated with a corresponding emoji.

        examples:
        task: "should I invest in apple stocks?"
        response:
        {
            "server": "💰 Finance Agent",
            "agent_role_prompt": "You are a seasoned finance analyst AI assistant. Your primary goal is to compose comprehensive, astute, impartial, and methodically arranged financial reports based on provided data and trends."
        }
        task: "could reselling sneakers become profitable?"
        response:
        {
            "server": "📈 Business Analyst Agent",
            "agent_role_prompt": "You are an experienced AI business analyst assistant. Your main objective is to produce comprehensive, insightful, impartial, and systematically structured business reports based on provided business data, market trends, and strategic analysis."
        }
        task: "what are the most interesting sites in Tel Aviv?"
        response:
        {
            "server": "🌍 Travel Agent",
            "agent_role_prompt": "You are a world-travelled AI tour guide assistant. Your main purpose is to draft engaging, insightful, unbiased, and well-structured travel reports on given locations, including history, attractions, and cultural insights."
        }
        task: "How did the events of June 23 in Ottawa Ontario impact debates in the House?"
        response:
        {
            "server": "📚 History Analyst Agent",
            "agent_role_prompt": "You are a renowned historian. Your main task is to analyze materials to deduce connections, causes, or influences, writing engaging, insightful, unbiased, and truthful reports from the materials at hand."
        }
    """


def generate_summary_prompt(query, data):
    """ Generates the summary prompt for the given question and text.
    Args: query (str): The question to generate the summary prompt for
          data (str): The text to generate the summary prompt for
    Returns: str: The summary prompt for the given question and text
    """
    return f'{data}\n Using the above text, summarize it based on the following task or query: "{query}".\n If the ' \
           f'query cannot be answered using the text, YOU MUST summarize the text in short.\n Include all factual ' \
           f'information such as numbers, stats, quotes, etc if available. '
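
For reference, the new report types plug into the existing dispatch; a hedged usage sketch with an illustrative question and placeholder context:

prompt_fn = get_report_by_type("old_newspapers")
prompt = prompt_fn(
    question="How was the 1918 influenza reported in local papers?",
    context="...OCR text gathered by ChroniclingAmericaSearch...",
)
# prompt is then sent as the user message in generate_report()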