codeninja/Arxive rag agent

## Arxive rag agent
# from autogen.agentchat.contrib.teachable_agent import TeachableAgent
# from autogen.agentchat.user_proxy_agent import UserProxyAgent
# from autogen.agentchat.conversable_agent import ConversableAgent
import os
import re
import json
import autogen
import autogen.retrieve_utils as retrieve_utils
import chromadb
import feedparser
import requests
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent
from src.lib.termination_msg import term_eom
from autogen.agentchat.contrib.teachable_agent import TeachableAgent

# Define a path for the JSON file to store structured notes and read status
STRUCTURED_NOTES_DB_PATH = "structured_notes_db.json"


# Function to load structured notes from the JSON file
def load_structured_notes():
    if not os.path.exists(STRUCTURED_NOTES_DB_PATH):
        return {}
    with open(STRUCTURED_NOTES_DB_PATH, "r") as file:
        return json.load(file)


# Function to save structured notes to the JSON file
def save_structured_notes(notes_db):
    with open(STRUCTURED_NOTES_DB_PATH, "w") as file:
        json.dump(notes_db, file, indent=4)


class ArxivAgent(autogen.agentchat.Agent):
    def search_by_date(self, start_date, end_date, query, max_results=10):
        """
        Search arXiv for papers published between start_date and end_date with a specific query.

        :param start_date: The start date for the search in the format YYYY-MM-DD.
        :param end_date: The end date for the search in the format YYYY-MM-DD.
        :param query: The query to search for.
        :param max_results: The maximum number of results to return.
        :return: A list of papers that match the query and were published between the start and end dates.
        """
        base_url = "http://export.arxiv.org/api/query?"
        search_query = (
            f"search_query={query}+AND+submittedDate:[{start_date}+TO+{end_date}]"
        )
        start = 0
        max_results = f"max_results={max_results}"
        url = f"{base_url}{search_query}&start={start}&{max_results}"
        response = requests.get(url)
        feed = feedparser.parse(response.content)

        papers = [
            {
                "title": entry.title,
                "link": entry.link,
                "summary": entry.summary,
                "date": entry.published,
                "category": entry.arxiv_primary_category["term"]
                if "arxiv_primary_category" in entry
                else entry.tags[0]["term"],
            }
            for entry in feed.entries
        ]
        return papers

    seed = "arxiv"

    def __init__(
        self,
        name: str,
        llm_config: dict = {},
        human_input_mode="COMPLETE",
        code_execution_config={"work_dir": "arxiv"},
        is_termination_msg=term_eom,
    ):
        # Using dict.setdefault to optimize default settings for llm config
        llm_config.setdefault("seed", self.seed)
        self.seed = llm_config["seed"]

        llm_config["config_list"] = autogen.config_list_from_json(
            "OAI_CONFIG_LIST",
            filter_dict={
                "model": [
                    "gpt-4",
                    "gpt-4-0613",
                    "gpt-3.5-turbo",
                    "gpt-3.5-turbo-0613",
                    "gpt-3.5-turbo-16k",
                    "gpt-3.5-turbo-16k-0613",
                    "gpt-4-1106-preview",
                ],
            },
        )
        llm_config.setdefault("temperature", 0)
        llm_config.setdefault("model", "gpt-3.5-turbo-0613")

        llm_config.setdefault(
            "functions",
            [
                self.queryFunction,
                self.downloadFunction,
                self.summarizeFunction,
                self.readPdfFunction,
            ],
        )

        system_message = """You are a research librarian tracking scientific papers.

            You have several tasks you can complete:
            - /chat: [default] chat with the user, answering questions about research you've read.
            - /search: query for new papers on a topic with the query_arxiv function.
            - /searchResults: You must summarize the result and print the Date, Title, Category, Arxiv Link, PDF Link, and Summary in markdown format.
            - /download: download a pdf from a url with the download_pdf function
            - /read: open the pdf and extract the text using the read_pdf function. After you read the pdf, you must create tangiable structured notes on the paper starting with the title, summary, key details, learnings, recomendations, potential applications. Include critical details that we would need to be able to recall in planning sessions when discussing future product ideas. The goal is to be able to link cutting edge research to product ideas.
            - /summarize: summarize a paper into a short paragraph with the summarize_paper function, effects, and significance
            - /notate: generate detailed structured notes on a paper with the write_notes function
            - /report: Provide a report when provided research data detailing the function, effects, and significance of all the research combined.
            - /help: print this message
            - /terminate: terminate the conversation


            Once a command is complete, append a `TERMINATE` message to the end of the message to terminate the conversation.
            The user can not execute code directly. They must use the functions provided.
            """

        self.teach_config = {
            "verbosity": 1,  # 0 for basic info, 1 to add memory operations, 2 for analyzer messages, 3 for memo lists.
            "reset_db": False,  # Set to True to start over with an empty database.
            "path_to_db_dir": ".cache/research",  # Path to the directory where the database will be stored.
            "recall_threshold": 1.5,  # Higher numbers allow more (but less relevant) memos to be recalled.
        }

        self.agent = TeachableAgent(
            name="teachableagent",
            llm_config=llm_config,
            teach_config=self.teach_config,
            system_message=system_message,
        )

        self.function_map = {
            "query_arxiv": self.query_arxiv,
            "download_pdf": self.download_pdf,
            "summarize_paper": self.summarize_paper,
            "read_pdf": self.read_pdf,
        }

        self.agent.register_function(self.function_map)

        ragConfig = {
            "task": "text_to_text_generation",
            "docs_path": ["/src/global_context/research/"],
            "chunk_token_size": 1000,
            "model": llm_config["config_list"][0]["model"],
            "client": chromadb.PersistentClient(path="./arxiv/chromadb"),
            "collection_name": "arxiv",
            "get_or_create": True,
        }

        self.ragAgent = RetrieveAssistantAgent(
            name="RagAgent",
            llm_config=llm_config,
            system_message="RagAgent. Retrieve the answer from the knowledge base.",
            human_input_mode="COMPLETE",
            code_execution_config={"work_dir": "arxiv"},
        )

        self.ragUserProxy = RetrieveUserProxyAgent(
            name="RagUserProxy",
            human_input_mode="NEVER",
            retrieve_config=ragConfig,
        )

        self.userProxy = autogen.UserProxyAgent(
            name="User",
            human_input_mode="ALWAYS",
            code_execution_config={"work_dir": "arxiv"},
        )

        self.critic = autogen.ConversableAgent(
            name="Critic",
            llm_config={
                "temperature": 0.2,
                "request_timeout": 600,
                "seed": "arxiv",
                "model": "gpt-3.5-turbo-0613",
                "config_list": autogen.config_list_openai_aoai(exclude="aoai"),
            },
            human_input_mode="COMPLETE",
            system_message="Critic. Critique the plan, the execution, the result, and the conversation. Do not critique the user.",
        )

        self.groupchat = autogen.GroupChat(
            agents=[
                # self.userProxy,
                # self.ragAgent,
                # self.ragUserProxy,
                self.agent,
                self.critic,
            ],
            messages=[],
            max_round=50,
        )

        self.groupchatManager = autogen.GroupChatManager(
            groupchat=self.groupchat, llm_config=llm_config
        )
        # Initialize any additional state or configuration here

    def get_agent(self):
        return self.agent

    queryFunction = {
        "name": "query_arxiv",
        "description": "query arxiv for a topic",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The query to search for.",
                },
                "max_results": {
                    "type": "integer",
                    "description": "The maximum number of results to return.",
                },
            },
            "required": ["query"],
        },
    }

    def query_arxiv(
        self,
        query: str,
        max_results: int = 10,
        start_date: str = None,
        end_date: str = None,
    ):
        base_url = "http://export.arxiv.org/api/query?"
        search_query = f"search_query=all:{query}"
        if start_date and end_date:
            search_query += f"+AND+submittedDate:[{start_date}+TO+{end_date}]"
        start = 0
        max_results = f"max_results={max_results}"
        url = f"{base_url}{search_query}&start={start}&{max_results}"
        response = requests.get(url)
        feed = feedparser.parse(response.content)

        papers = [
            {
                "title": entry.title,
                "link": entry.link,
                "summary": entry.summary,
                "date": entry.published,
                "category": entry.arxiv_primary_category["term"]
                if "arxiv_primary_category" in entry
                else entry.tags[0]["term"],
            }
            for entry in feed.entries
        ]
        return "/searchResults " + str(papers)

    downloadFunction = {
        "name": "download_pdf",
        "description": "download a pdf from a url",
        "parameters": {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The url to download the pdf from.",
                },
                "filename": {
                    "type": "string",
                    "description": "The filename to save the pdf as. This should match ArXiv's file name.",
                },
            },
            "required": ["url"],
        },
    }

    def download_pdf(self, url: str, filename: str) -> str:
        """
        Download a pdf from a url and save it in a topic categorized folder.

        :param url: The url to download the pdf from.
        :param topic: The research topic to categorize the pdf.
        :return: The path to the downloaded pdf.
        """
        # Sanitize the topic string to create a valid directory name

        # Create the directory path for the topic
        topic_dir = os.path.join("src", "global_context", "research")
        os.makedirs(topic_dir, exist_ok=True)

        # Sanitize the filename string to create a valid filename make sure to include the .pdf extension
        sanitized_filename = (
            re.sub(r"[^\w\s-]", "", filename.replace(".pdf", ""))
            .strip()
            .lower()
            .replace(" ", "_")
            + ".pdf"
        )

        # Create the full path for the pdf
        pdf_path = os.path.join(topic_dir, sanitized_filename)

        # Download and save the pdf
        response = requests.get(url)
        with open(pdf_path, "wb") as f:
            f.write(response.content)

        return pdf_path

    summarizeFunction = {
        "name": "summarize_paper",
        "description": "summarize a paper into a short paragraph",
        "parameters": {
            "type": "object",
            "properties": {
                "filename": {
                    "type": "string",
                    "description": "The path to the pdf to summarize.",
                },
            },
            "required": ["filename"],
        },
    }

    def summarize_paper(self, filename: str) -> str:
        self.ragUserProxy.initiate_chat(
            self.ragAgent,
            context=f"/summarize the paper {filename}",
        )
        return self.ragAgent.last_message()["content"]

    readPdfFunction = {
        "name": "read_pdf",
        "description": "read a pdf and extract the text",
        "parameters": {
            "type": "object",
            "properties": {
                "filename": {
                    "type": "string",
                    "description": "The filename of the pdf to read.",
                },
            },
            "required": ["filename"],
        },
    }

    def read_pdf(self, filename: str) -> str:
        # Load the structured notes database
        notes_db = load_structured_notes()

        # Check if the PDF has been read previously
        if filename in notes_db:
            # Return the structured notes if available
            return notes_db[filename]["notes"]

        # Read the PDF and generate structured notes
        file_dir = os.path.join("src", "global_context", "research", filename)
        structured_notes = retrieve_utils.extract_text_from_pdf(file_dir)

        # Save the structured notes and read status to the database
        notes_db[filename] = {"notes": structured_notes, "read": True}
        save_structured_notes(notes_db)

        return structured_notes
	# from autogen.agentchat.contrib.teachable_agent import TeachableAgent
	# from autogen.agentchat.user_proxy_agent import UserProxyAgent
	# from autogen.agentchat.conversable_agent import ConversableAgent
	import os
	import re
	import json
	import autogen
	import autogen.retrieve_utils as retrieve_utils
	import chromadb
	import feedparser
	import requests
	from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
	from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent
	from src.lib.termination_msg import term_eom
	from autogen.agentchat.contrib.teachable_agent import TeachableAgent

	# Define a path for the JSON file to store structured notes and read status
	STRUCTURED_NOTES_DB_PATH = "structured_notes_db.json"


	# Function to load structured notes from the JSON file
	def load_structured_notes():
	if not os.path.exists(STRUCTURED_NOTES_DB_PATH):
	return {}
	with open(STRUCTURED_NOTES_DB_PATH, "r") as file:
	return json.load(file)


	# Function to save structured notes to the JSON file
	def save_structured_notes(notes_db):
	with open(STRUCTURED_NOTES_DB_PATH, "w") as file:
	json.dump(notes_db, file, indent=4)


	class ArxivAgent(autogen.agentchat.Agent):
	def search_by_date(self, start_date, end_date, query, max_results=10):
	"""
	Search arXiv for papers published between start_date and end_date with a specific query.

	:param start_date: The start date for the search in the format YYYY-MM-DD.
	:param end_date: The end date for the search in the format YYYY-MM-DD.
	:param query: The query to search for.
	:param max_results: The maximum number of results to return.
	:return: A list of papers that match the query and were published between the start and end dates.
	"""
	base_url = "http://export.arxiv.org/api/query?"
	search_query = (
	f"search_query={query}+AND+submittedDate:[{start_date}+TO+{end_date}]"
	)
	start = 0
	max_results = f"max_results={max_results}"
	url = f"{base_url}{search_query}&start={start}&{max_results}"
	response = requests.get(url)
	feed = feedparser.parse(response.content)

	papers = [
	{
	"title": entry.title,
	"link": entry.link,
	"summary": entry.summary,
	"date": entry.published,
	"category": entry.arxiv_primary_category["term"]
	if "arxiv_primary_category" in entry
	else entry.tags[0]["term"],
	}
	for entry in feed.entries
	]
	return papers

	seed = "arxiv"

	def __init__(
	self,
	name: str,
	llm_config: dict = {},
	human_input_mode="COMPLETE",
	code_execution_config={"work_dir": "arxiv"},
	is_termination_msg=term_eom,
	):
	# Using dict.setdefault to optimize default settings for llm config
	llm_config.setdefault("seed", self.seed)
	self.seed = llm_config["seed"]

	llm_config["config_list"] = autogen.config_list_from_json(
	"OAI_CONFIG_LIST",
	filter_dict={
	"model": [
	"gpt-4",
	"gpt-4-0613",
	"gpt-3.5-turbo",
	"gpt-3.5-turbo-0613",
	"gpt-3.5-turbo-16k",
	"gpt-3.5-turbo-16k-0613",
	"gpt-4-1106-preview",
	],
	},
	)
	llm_config.setdefault("temperature", 0)
	llm_config.setdefault("model", "gpt-3.5-turbo-0613")

	llm_config.setdefault(
	"functions",
	[
	self.queryFunction,
	self.downloadFunction,
	self.summarizeFunction,
	self.readPdfFunction,
	],
	)

	system_message = """You are a research librarian tracking scientific papers.

	You have several tasks you can complete:
	- /chat: [default] chat with the user, answering questions about research you've read.
	- /search: query for new papers on a topic with the query_arxiv function.
	- /searchResults: You must summarize the result and print the Date, Title, Category, Arxiv Link, PDF Link, and Summary in markdown format.
	- /download: download a pdf from a url with the download_pdf function
	- /read: open the pdf and extract the text using the read_pdf function. After you read the pdf, you must create tangiable structured notes on the paper starting with the title, summary, key details, learnings, recomendations, potential applications. Include critical details that we would need to be able to recall in planning sessions when discussing future product ideas. The goal is to be able to link cutting edge research to product ideas.
	- /summarize: summarize a paper into a short paragraph with the summarize_paper function, effects, and significance
	- /notate: generate detailed structured notes on a paper with the write_notes function
	- /report: Provide a report when provided research data detailing the function, effects, and significance of all the research combined.
	- /help: print this message
	- /terminate: terminate the conversation


	Once a command is complete, append a `TERMINATE` message to the end of the message to terminate the conversation.
	The user can not execute code directly. They must use the functions provided.
	"""

	self.teach_config = {
	"verbosity": 1, # 0 for basic info, 1 to add memory operations, 2 for analyzer messages, 3 for memo lists.
	"reset_db": False, # Set to True to start over with an empty database.
	"path_to_db_dir": ".cache/research", # Path to the directory where the database will be stored.
	"recall_threshold": 1.5, # Higher numbers allow more (but less relevant) memos to be recalled.
	}

	self.agent = TeachableAgent(
	name="teachableagent",
	llm_config=llm_config,
	teach_config=self.teach_config,
	system_message=system_message,
	)

	self.function_map = {
	"query_arxiv": self.query_arxiv,
	"download_pdf": self.download_pdf,
	"summarize_paper": self.summarize_paper,
	"read_pdf": self.read_pdf,
	}

	self.agent.register_function(self.function_map)

	ragConfig = {
	"task": "text_to_text_generation",
	"docs_path": ["/src/global_context/research/"],
	"chunk_token_size": 1000,
	"model": llm_config["config_list"][0]["model"],
	"client": chromadb.PersistentClient(path="./arxiv/chromadb"),
	"collection_name": "arxiv",
	"get_or_create": True,
	}

	self.ragAgent = RetrieveAssistantAgent(
	name="RagAgent",
	llm_config=llm_config,
	system_message="RagAgent. Retrieve the answer from the knowledge base.",
	human_input_mode="COMPLETE",
	code_execution_config={"work_dir": "arxiv"},
	)

	self.ragUserProxy = RetrieveUserProxyAgent(
	name="RagUserProxy",
	human_input_mode="NEVER",
	retrieve_config=ragConfig,
	)

	self.userProxy = autogen.UserProxyAgent(
	name="User",
	human_input_mode="ALWAYS",
	code_execution_config={"work_dir": "arxiv"},
	)

	self.critic = autogen.ConversableAgent(
	name="Critic",
	llm_config={
	"temperature": 0.2,
	"request_timeout": 600,
	"seed": "arxiv",
	"model": "gpt-3.5-turbo-0613",
	"config_list": autogen.config_list_openai_aoai(exclude="aoai"),
	},
	human_input_mode="COMPLETE",
	system_message="Critic. Critique the plan, the execution, the result, and the conversation. Do not critique the user.",
	)

	self.groupchat = autogen.GroupChat(
	agents=[
	# self.userProxy,
	# self.ragAgent,
	# self.ragUserProxy,
	self.agent,
	self.critic,
	],
	messages=[],
	max_round=50,
	)

	self.groupchatManager = autogen.GroupChatManager(
	groupchat=self.groupchat, llm_config=llm_config
	)
	# Initialize any additional state or configuration here

	def get_agent(self):
	return self.agent

	queryFunction = {
	"name": "query_arxiv",
	"description": "query arxiv for a topic",
	"parameters": {
	"type": "object",
	"properties": {
	"query": {
	"type": "string",
	"description": "The query to search for.",
	},
	"max_results": {
	"type": "integer",
	"description": "The maximum number of results to return.",
	},
	},
	"required": ["query"],
	},
	}

	def query_arxiv(
	self,
	query: str,
	max_results: int = 10,
	start_date: str = None,
	end_date: str = None,
	):
	base_url = "http://export.arxiv.org/api/query?"
	search_query = f"search_query=all:{query}"
	if start_date and end_date:
	search_query += f"+AND+submittedDate:[{start_date}+TO+{end_date}]"
	start = 0
	max_results = f"max_results={max_results}"
	url = f"{base_url}{search_query}&start={start}&{max_results}"
	response = requests.get(url)
	feed = feedparser.parse(response.content)

	papers = [
	{
	"title": entry.title,
	"link": entry.link,
	"summary": entry.summary,
	"date": entry.published,
	"category": entry.arxiv_primary_category["term"]
	if "arxiv_primary_category" in entry
	else entry.tags[0]["term"],
	}
	for entry in feed.entries
	]
	return "/searchResults " + str(papers)

	downloadFunction = {
	"name": "download_pdf",
	"description": "download a pdf from a url",
	"parameters": {
	"type": "object",
	"properties": {
	"url": {
	"type": "string",
	"description": "The url to download the pdf from.",
	},
	"filename": {
	"type": "string",
	"description": "The filename to save the pdf as. This should match ArXiv's file name.",
	},
	},
	"required": ["url"],
	},
	}

	def download_pdf(self, url: str, filename: str) -> str:
	"""
	Download a pdf from a url and save it in a topic categorized folder.

	:param url: The url to download the pdf from.
	:param topic: The research topic to categorize the pdf.
	:return: The path to the downloaded pdf.
	"""
	# Sanitize the topic string to create a valid directory name

	# Create the directory path for the topic
	topic_dir = os.path.join("src", "global_context", "research")
	os.makedirs(topic_dir, exist_ok=True)

	# Sanitize the filename string to create a valid filename make sure to include the .pdf extension
	sanitized_filename = (
	re.sub(r"[^\w\s-]", "", filename.replace(".pdf", ""))
	.strip()
	.lower()
	.replace(" ", "_")
	+ ".pdf"
	)

	# Create the full path for the pdf
	pdf_path = os.path.join(topic_dir, sanitized_filename)

	# Download and save the pdf
	response = requests.get(url)
	with open(pdf_path, "wb") as f:
	f.write(response.content)

	return pdf_path

	summarizeFunction = {
	"name": "summarize_paper",
	"description": "summarize a paper into a short paragraph",
	"parameters": {
	"type": "object",
	"properties": {
	"filename": {
	"type": "string",
	"description": "The path to the pdf to summarize.",
	},
	},
	"required": ["filename"],
	},
	}

	def summarize_paper(self, filename: str) -> str:
	self.ragUserProxy.initiate_chat(
	self.ragAgent,
	context=f"/summarize the paper {filename}",
	)
	return self.ragAgent.last_message()["content"]

	readPdfFunction = {
	"name": "read_pdf",
	"description": "read a pdf and extract the text",
	"parameters": {
	"type": "object",
	"properties": {
	"filename": {
	"type": "string",
	"description": "The filename of the pdf to read.",
	},
	},
	"required": ["filename"],
	},
	}

	def read_pdf(self, filename: str) -> str:
	# Load the structured notes database
	notes_db = load_structured_notes()

	# Check if the PDF has been read previously
	if filename in notes_db:
	# Return the structured notes if available
	return notes_db[filename]["notes"]

	# Read the PDF and generate structured notes
	file_dir = os.path.join("src", "global_context", "research", filename)
	structured_notes = retrieve_utils.extract_text_from_pdf(file_dir)

	# Save the structured notes and read status to the database
	notes_db[filename] = {"notes": structured_notes, "read": True}
	save_structured_notes(notes_db)

	return structured_notes