nikogamulin/bibliography_analyzer.ipynb

## bibliography_analyzer.ipynb
{
    "cells": [
        {
            "cell_type": "code",
            "execution_count": 2,
            "metadata": {},
            "outputs": [],
            "source": [
                "# Standard library imports\n",
                "import os\n",
                "import re\n",
                "import textwrap\n",
                "from pathlib import Path\n",
                "import shutil\n",
                "\n",
                "# Environment variable handling\n",
                "from dotenv import find_dotenv, load_dotenv, dotenv_values\n",
                "\n",
                "# OpenAI related imports\n",
                "import openai\n",
                "from langchain.chains import LLMChain\n",
                "from langchain.vectorstores import FAISS\n",
                "from langchain.chat_models import ChatOpenAI\n",
                "from langchain.document_loaders import YoutubeLoader\n",
                "from langchain.embeddings.openai import OpenAIEmbeddings\n",
                "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
                "from langchain.prompts.chat import (\n",
                "    ChatPromptTemplate,\n",
                "    SystemMessagePromptTemplate, \n",
                "    HumanMessagePromptTemplate\n",
                ")\n",
                "\n",
                "# Autogen library (purpose not clear yet)\n",
                "import autogen\n",
                "\n",
                "# Additional embeddings\n",
                "from langchain.embeddings import LlamaCppEmbeddings, HuggingFaceEmbeddings\n",
                "from sentence_transformers import SentenceTransformer\n",
                "\n",
                "# Numerical and clustering libraries\n",
                "import numpy as np\n",
                "from sklearn.cluster import KMeans\n"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 3,
            "metadata": {},
            "outputs": [],
            "source": [
                "\n",
                "# Load environment variables from the specified .env file\n",
                "config = dotenv_values(\"../.env\")\n"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 5,
            "metadata": {},
            "outputs": [],
            "source": [
                "\n",
                "# Initialize HuggingFace embeddings using a specified model\n",
                "embeddings = HuggingFaceEmbeddings(model_name='thenlper/gte-base')\n",
                "\n",
                "# Initialize the SentenceTransformer model using the same model name\n",
                "model = SentenceTransformer('thenlper/gte-base')\n"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 6,
            "metadata": {},
            "outputs": [],
            "source": [
                "\n",
                "def parse_bib_file(bib_content):\n",
                "    \"\"\"Parse the content of a .bib file and extract articles.\n",
                "    \n",
                "    Parameters:\n",
                "    - bib_content (str): The content of the .bib file.\n",
                "    \n",
                "    Returns:\n",
                "    - list[dict]: A list of articles where each article is represented as a dictionary.\n",
                "    \"\"\"\n",
                "    # Split the content by \"@article\" to get individual articles\n",
                "    articles = re.split(r'(?=@article)', bib_content)\n",
                "    article_list = []\n",
                "    \n",
                "    for article in articles:\n",
                "        if not article.strip():\n",
                "            continue\n",
                "            \n",
                "        # Extract the unique identifier for the article\n",
                "        match = re.search(r'@article\\{(?P<id>.*?),', article)\n",
                "        if not match:\n",
                "            continue\n",
                "        article_id = match.group('id').strip()\n",
                "        \n",
                "        # Extract the fields within the article entry\n",
                "        fields = re.findall(r'(\\w+) = {(.*?)},', article, re.DOTALL)\n",
                "        article_dict = {'ID': article_id}\n",
                "        \n",
                "        for field, value in fields:\n",
                "            article_dict[field] = value.strip().replace('\\n', ' ')\n",
                "            \n",
                "        article_list.append(article_dict)\n",
                "        \n",
                "    return article_list\n",
                "\n",
                "def extract_articles_from_folder(folder_path, article_type=None):\n",
                "    \"\"\"Extract articles from multiple .bib files in a folder and optionally filter by article type.\n",
                "    \n",
                "    Parameters:\n",
                "    - folder_path (str): Path to the folder containing .bib files.\n",
                "    - article_type (str, optional): The type of article to filter by.\n",
                "    \n",
                "    Returns:\n",
                "    - list[dict]: A list of articles.\n",
                "    \"\"\"\n",
                "    all_articles = []\n",
                "    \n",
                "    # Iterate over all files in the folder\n",
                "    for filename in os.listdir(folder_path):\n",
                "        if filename.endswith(\".bib\"):\n",
                "            with open(os.path.join(folder_path, filename), \"r\") as file:\n",
                "                bib_content = file.read()\n",
                "                articles = parse_bib_file(bib_content)\n",
                "                all_articles.extend(articles)\n",
                "    \n",
                "    # Optionally filter by article type\n",
                "    if article_type:\n",
                "        all_articles = [article for article in all_articles if article.get('Type') == article_type]\n",
                "        \n",
                "    return all_articles"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 15,
            "metadata": {},
            "outputs": [],
            "source": [
                "\n",
                "def parse_arbitrary_text(text, chunk_size=1000, chunk_overlap=200, field=None):\n",
                "    \"\"\"Parse and chunk arbitrary text based on specified size and overlap.\n",
                "    \n",
                "    Parameters:\n",
                "    - text (str): The text to be chunked.\n",
                "    - chunk_size (int): The size of each chunk.\n",
                "    - chunk_overlap (int): The overlap between chunks.\n",
                "    - field (str, optional): The field name to be used if the text is a dictionary.\n",
                "    \n",
                "    Returns:\n",
                "    - list[str]: A list of text chunks.\n",
                "    \"\"\"\n",
                "    if field and isinstance(text, dict):\n",
                "        text = text.get(field, \"\")\n",
                "    \n",
                "    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
                "    chunked_text = text_splitter.create_documents([text])\n",
                "    return chunked_text\n",
                "\n",
                "def get_list_of_chunks(articles_dict_list):\n",
                "    articles_filtered = list(filter(lambda x: 'Abstract' in x.keys(), articles_dict_list))\n",
                "    chunks_abstracts = list(map(lambda x: parse_arbitrary_text(x['Abstract']), articles_filtered))\n",
                "    return list(map(lambda x: list(map(lambda y: y.page_content, x)), chunks_abstracts))\n",
                "\n",
                "def get_abstract_embedding_with_mapping(bibliography_folder_path, article_type=None):\n",
                "    \"\"\"Get embeddings for abstracts from articles and maintain a mapping of indices.\n",
                "    \n",
                "    Parameters:\n",
                "    - bibliography_folder_path (str): Path to the folder containing .bib files.\n",
                "    \n",
                "    Returns:\n",
                "    - tuple: A tuple containing the chunked abstracts, their embeddings, and an index mapping.\n",
                "    \"\"\"\n",
                "    articles = extract_articles_from_folder(bibliography_folder_path)\n",
                "    \n",
                "    articles_abstracts = extract_articles_from_folder(bibliography_folder_path, article_type=article_type)\n",
                "    articles_chunked_abstracts = get_list_of_chunks(articles_abstracts)\n",
                "    \n",
                "    chunks_abstracts_embeddings = list(map(lambda x: model.encode(x), articles_chunked_abstracts))\n",
                "    flattened_chunks_abstracts_embeddings = [vec for sublist in chunks_abstracts_embeddings for vec in sublist]\n",
                "    \n",
                "    # Create a mapping of numpy array row index to original list index\n",
                "    index_mapping = [(i, j) for i, sublist in enumerate(chunks_abstracts_embeddings) for j, _ in enumerate(sublist)]\n",
                "    \n",
                "    return articles_chunked_abstracts, flattened_chunks_abstracts_embeddings, index_mapping"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 16,
            "metadata": {},
            "outputs": [],
            "source": [
                "\n",
                "def get_cluster_representation(articles_chunked_abstracts, flattened_chunks_abstracts_embeddings, index_mapping, n_clusters):\n",
                "    \"\"\"Get a representative chunk for each cluster.\n",
                "    \n",
                "    Parameters:\n",
                "    - articles_chunked_abstracts (list[list[str]]): List of chunked abstracts.\n",
                "    - flattened_chunks_abstracts_embeddings (list[np.array]): Embeddings of the chunked abstracts.\n",
                "    - index_mapping (list[tuple[int, int]]): Mapping of indices from the flattened embeddings to original chunked abstracts.\n",
                "    - n_clusters (int): Number of clusters for KMeans clustering.\n",
                "    \n",
                "    Returns:\n",
                "    - tuple: A tuple containing the representative chunks for each cluster and the cluster centroids.\n",
                "    \"\"\"\n",
                "    # Run KMeans clustering on the embeddings\n",
                "    kmeans = KMeans(n_clusters=n_clusters, n_init=10)\n",
                "    kmeans.fit(flattened_chunks_abstracts_embeddings)\n",
                "    centroids = kmeans.cluster_centers_\n",
                "    \n",
                "    # Find the indices of the closest elements to each centroid\n",
                "    closest_point_indices = [np.argmin(np.linalg.norm(flattened_chunks_abstracts_embeddings - centroid, axis=1)) for centroid in centroids]\n",
                "    closest_chunks = []\n",
                "    for closest_point_index in closest_point_indices:\n",
                "        index_abstract, index_embedding = index_mapping[closest_point_index]\n",
                "        closest_chunks.append(articles_chunked_abstracts[index_abstract][index_embedding])\n",
                "    \n",
                "    return closest_chunks, centroids\n",
                "\n",
                "def perform_cluster_analysis(bibliography_folder_path, n_clusters=3):\n",
                "    \"\"\"Perform clustering on abstract embeddings and get cluster representations.\n",
                "    \n",
                "    Parameters:\n",
                "    - bibliography_folder_path (str): Path to the folder containing .bib files.\n",
                "    - n_clusters (int, optional): Number of clusters for KMeans clustering. Default is 3.\n",
                "    \n",
                "    Returns:\n",
                "    - tuple: A tuple containing the representative chunks for each cluster and the cluster centroids.\n",
                "    \"\"\"\n",
                "    articles_chunked_abstracts, flattened_chunks_abstracts_embeddings, index_mapping = get_abstract_embedding_with_mapping(bibliography_folder_path)\n",
                "    closest_chunks, centroids = get_cluster_representation(articles_chunked_abstracts, flattened_chunks_abstracts_embeddings, index_mapping, n_clusters)\n",
                "    return closest_chunks, centroids\n"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 17,
            "metadata": {},
            "outputs": [],
            "source": [
                "\n",
                "# Extract and embed abstracts from the provided bibliography folder\n",
                "articles_chunked_abstracts, flattened_chunks_abstracts_embeddings, index_mapping = get_abstract_embedding_with_mapping('bibliography_analyzer/bibliography')\n"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 18,
            "metadata": {},
            "outputs": [],
            "source": [
                "config_list = [\n",
                "    {\n",
                "        'model': 'gpt-4',\n",
                "        'api_key': config['OPENAI_API_KEY']\n",
                "    }\n",
                "]"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 19,
            "metadata": {},
            "outputs": [],
            "source": [
                "llm_config={\n",
                "    \"request_timeout\": 600,\n",
                "    \"seed\": 44,  # change the seed for different trials\n",
                "    \"config_list\": config_list,\n",
                "    \"temperature\": 0,\n",
                "    \"functions\": [\n",
                "        {\n",
                "            \"name\": \"run_cluster_analysis\",\n",
                "            \"description\": \"Given the folder where .bib files are stored, the function performs the cluster analysis, returning the closest chunks and the centroids.\",\n",
                "            \"parameters\": {\n",
                "                \"type\": \"object\",\n",
                "                \"properties\": {\n",
                "                    \"bibliography_folder_path\": {\n",
                "                        \"type\": \"string\",\n",
                "                        \"description\": \"folder in which .bib files are stored\"\n",
                "                    },\n",
                "                    \"n_clusters\": {\n",
                "                        \"type\": \"integer\",\n",
                "                        \"description\": \"number of clusters to be created\"\n",
                "                    },\n",
                "                }\n",
                "            },\n",
                "            \"required\": [\"bibliography_folder_path\", \"n_clusters\"],\n",
                "        }\n",
                "    ]\n",
                "}"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 20,
            "metadata": {},
            "outputs": [],
            "source": [
                "article_writer_prompt = \"\"\"\n",
                "You are an article writer, embarking on a journey to demystify the intricate blend of agriculture, data analysis, and artificial intelligence for the general public. Begin by setting the stage. Highlight the evolving landscape of agriculture,\n",
                "where time-honored practices are now intertwining with modern technological\n",
                "advancements like data analysis and AI.\\n\\n\n",
                "    \n",
                "Delve into the primary discoveries or insights your analysis has unveiled.\n",
                "Whether it's identifying patterns, clusters, or trends, explain the significance\n",
                "of these findings and how they relate to agriculture's broader picture.\\n\\n\"\n",
                "    \n",
                "Dive deeper into the intricacies. Discuss the challenges and potential solutions\n",
                "unearthed by the analysis. Explore how these insights can impact various facets\n",
                "of farming, from soil health to yield to sustainability.\\n\\n\n",
                "    \n",
                "Discuss how the data offers a glimpse into the future. Are there emerging practices\n",
                "or innovations that could reshape agriculture? Delve into potential strategies and\n",
                "how they might cater to the evolving needs of sustainable farming in an ever-changing world.\\n\\n\n",
                "    \n",
                "Wrap up the narrative. Emphasize the invaluable role of data analysis, possibly enhanced\n",
                "\"by AI, in charting the future course of agriculture. Highlight how such insights empower\n",
                "farmers and stakeholders to make informed decisions that not only boost productivity but\n",
                "also safeguard the environment.\n",
                "When you receive all the necessary feedback from the agronomist and the SEO optimizer, add the word TERMINATE to the message.\n",
                "\"\"\"\n",
                "\n",
                "def check_for_termination(message):\n",
                "    # print(\"check_for_termination\", message)\n",
                "    content = message.get(\"content\")\n",
                "    if content != None and (\"TERMINATE\" in content):\n",
                "        print(\"should terminate\")\n",
                "        return True\n",
                "    return False\n",
                "\n",
                "# create an AssistantAgent instance named \"assistant\"\n",
                "data_analyst = autogen.AssistantAgent(\n",
                "    name=\"data_analyst\",\n",
                "    llm_config=llm_config,\n",
                "    system_message=\"\"\"You are data analyst and you are responsible to help solve tasks related to data analysis. \n",
                "    Only use the tools provided to run the analysis. \n",
                "    When you complete the analysis, add the word DATA_ANALYSIS_COMPLETED to the message.\n",
                "    \"\"\",\n",
                "    function_map={\n",
                "        \"run_cluster_analysis\": perform_cluster_analysis\n",
                "    },\n",
                "    code_execution_config=False\n",
                ")\n",
                "\n",
                "article_writer = autogen.AssistantAgent(\n",
                "    name=\"article_writer\",\n",
                "    llm_config=llm_config,\n",
                "    human_input_mode=\"NEVER\",\n",
                "    code_execution_config=False,\n",
                "    system_message=article_writer_prompt,\n",
                "    is_termination_msg=check_for_termination,\n",
                "    function_map={\n",
                "        \"run_cluster_analysis\": perform_cluster_analysis\n",
                "    },\n",
                ")\n",
                "\n",
                "agronomist = autogen.AssistantAgent(\n",
                "    name=\"agronomist\",\n",
                "    llm_config=llm_config,\n",
                "    human_input_mode=\"NEVER\",\n",
                "    code_execution_config=False,\n",
                "    system_message=\"You are an agronomist, working in the field of regenerative organic agriculture and you help the article writer suggest text modifications in order to improve the article quality and check facts. Ask the article writer to provide you with the text you need to optimize. When you complete the checks and you are satisfied with writer's final version, add the word AGRONOMY_EXPERT_CHECK_COMPLETED to the message.\",\n",
                "    is_termination_msg=check_for_termination,\n",
                "    function_map={\n",
                "        \"run_cluster_analysis\": perform_cluster_analysis\n",
                "    },\n",
                ")\n",
                "\n",
                "seo_optimizer = autogen.AssistantAgent(\n",
                "    name=\"seo_optimizer\",\n",
                "    llm_config=llm_config,\n",
                "    human_input_mode=\"NEVER\",\n",
                "    code_execution_config=False,\n",
                "    system_message=\"\"\"\n",
                "    SEO optimizer, you help the article writer suggest text modifications for search engine optimization. Ask the article writer to provide you with the text you need to optimize.\n",
                "    \"\"\",\n",
                "    is_termination_msg=check_for_termination,\n",
                "    function_map={\n",
                "        \"run_cluster_analysis\": perform_cluster_analysis\n",
                "    },\n",
                ")\n",
                "\n",
                "# create a UserProxyAgent instance named \"user_proxy\"\n",
                "user_proxy = autogen.UserProxyAgent(\n",
                "    name=\"user_proxy\",\n",
                "    human_input_mode=\"NEVER\",\n",
                "    is_termination_msg=check_for_termination,\n",
                "    max_consecutive_auto_reply=10,\n",
                "    function_map={\n",
                "        \"run_cluster_analysis\": perform_cluster_analysis\n",
                "    },\n",
                "    code_execution_config={\"work_dir\": \"bibliography_analyzer\"},\n",
                "    system_message=\"\"\"User, you are trying to determine the optimal number of centroids and return the text chunks.\n",
                "    When you respond with the status add the word TERMINATE\"\"\"\n",
                ")\n",
                "\n",
                "groupchat = autogen.GroupChat(agents=[user_proxy, data_analyst, agronomist, seo_optimizer, article_writer], messages=[], max_round=50)\n",
                "manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=llm_config)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 21,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "\u001b[33muser_proxy\u001b[0m (to chat_manager):\n",
                        "\n",
                        "\n",
                        "in folder 'bibliography_analyzer/bibliography', there are multiple .bib files. With the help of predefined function, get run k-means clustering and get the chunks of text representing the content closest to the centroids and the centroid vector representation. \n",
                        "run k-mean clustering multiple times and depending on centroid values, determine the optimal number of centroids by taking into account the following metric:\n",
                        "* The articles are from the field of agriculture, specifically related to crop rotation. Therefore, the clusters should represent different subconcepts related to crop rotation.\n",
                        "* The number of clusters should be as small as possible, but the clusters should be as different as possible.\n",
                        "After determining the optimal number of centroids along with conceptual representation, please, provide an explanation of your reasoning.\n",
                        "The article that has to be written is related to crop rotation and the importance of crop rotation has to be presented to wider audience. Also, it has to be explained, why the specific data analysis is important and support the claims with the analysis result.\n",
                        "\n",
                        "\n",
                        "--------------------------------------------------------------------------------\n",
                        "\u001b[33mdata_analyst\u001b[0m (to chat_manager):\n",
                        "\n",
                        "\u001b[32m***** Suggested function Call: run_cluster_analysis *****\u001b[0m\n",
                        "Arguments: \n",
                        "\n",
                        "{\n",
                        "\"bibliography_folder_path\": \"bibliography_analyzer/bibliography\",\n",
                        "\"n_clusters\": 2\n",
                        "}\n",
                        "\u001b[32m*********************************************************\u001b[0m\n",
                        "\n",
                        "--------------------------------------------------------------------------------\n",
                        "\u001b[35m\n",
                        ">>>>>>>> EXECUTING FUNCTION run_cluster_analysis...\u001b[0m\n",
                        "\u001b[33mdata_analyst\u001b[0m (to chat_manager):\n",
                        "\n",
                        "\u001b[32m***** Response from calling function \"run_cluster_analysis\" *****\u001b[0m\n",
                        "(['0-20 cm) during October-November, before planting. The rotations    significantly influenced mean SOM level, the order being fallow    (lowest), continuous wheat, lentil, chickpea, vetch, and medic    (highest). The mean effect of N was to increase SOM, but grazing    intensity tended to decrease SOM. While results from different aspects    of the trial published elsewhere demonstrated the value of legume-based    rotations as biologically and economically viable alternatives to fallow    or continuous cropping, this soil sampling SOM study showed that crop    production can be compatible with the goal of improving soil quality,    with potential environmental benefits. Thus, soil and crop management    practices involving appropriate rotations (legumes/cereals), adequate N    fertilization of the cereal crop, and retention of crop residues can    combine sustainable and economic cropping while reversing soil    degradation.', 'considered the    risks of yield losses and/or lower harvest quality plus harvesting    difficulties. In the medium term, they anticipated the risk of finding a    weed species in another crop of the rotation where control would be    difficult or costly, weighing the risks of yield loss against the cost    and effectiveness of solutions, not only in the current crop but also in    subsequent crops, so that once again, the rotation was the central focus    of weed control. In the long term, their main aim was to limit the soil    seed bank to an acceptable level. The farmers interviewed stated that    they would continue to implement a weed control programme that they    deemed satisfactory as long as no new problem appeared, and until they    could learn about more effective technical solutions. When designing a    DSS that will ensure successful, more sustainable weed management    practices, it is crucial to take account of both the complexity of the    decision-making process and'], array([[ 7.47992328e-05, -4.99498346e-03, -6.29644726e-03, ...,\n",
                        "         8.90509491e-03,  1.63265296e-02, -1.72438999e-02],\n",
                        "       [ 3.31143168e-03, -9.42959364e-03, -8.52153129e-03, ...,\n",
                        "        -4.74104248e-04,  1.25755643e-02, -1.77814166e-02]]))\n",
                        "\u001b[32m*****************************************************************\u001b[0m\n",
                        "\n",
                        "--------------------------------------------------------------------------------\n",
                        "\u001b[33mdata_analyst\u001b[0m (to chat_manager):\n",
                        "\n",
                        "\u001b[32m***** Suggested function Call: run_cluster_analysis *****\u001b[0m\n",
                        "Arguments: \n",
                        "\n",
                        "{\n",
                        "\"bibliography_folder_path\": \"bibliography_analyzer/bibliography\",\n",
                        "\"n_clusters\": 3\n",
                        "}\n",
                        "\u001b[32m*********************************************************\u001b[0m\n",
                        "\n",
                        "--------------------------------------------------------------------------------\n",
                        "\u001b[35m\n",
                        ">>>>>>>> EXECUTING FUNCTION run_cluster_analysis...\u001b[0m\n",
                        "\u001b[33mdata_analyst\u001b[0m (to chat_manager):\n",
                        "\n",
                        "\u001b[32m***** Response from calling function \"run_cluster_analysis\" *****\u001b[0m\n",
                        "(['considered the    risks of yield losses and/or lower harvest quality plus harvesting    difficulties. In the medium term, they anticipated the risk of finding a    weed species in another crop of the rotation where control would be    difficult or costly, weighing the risks of yield loss against the cost    and effectiveness of solutions, not only in the current crop but also in    subsequent crops, so that once again, the rotation was the central focus    of weed control. In the long term, their main aim was to limit the soil    seed bank to an acceptable level. The farmers interviewed stated that    they would continue to implement a weed control programme that they    deemed satisfactory as long as no new problem appeared, and until they    could learn about more effective technical solutions. When designing a    DSS that will ensure successful, more sustainable weed management    practices, it is crucial to take account of both the complexity of the    decision-making process and', 'of water infiltration, content of carbon, nitrogen, water    extractable organic carbon (WEOC) and organic nitrogen (WEON), residue    cover, and biological activity were all increased by NT and CC practices    relative to STNO. However, effects varied by depth with NT increasing    soil bulk density by 12\\\\% in the 0-15 cm depth and 10\\\\% in the 15-30 cm    depth. Higher levels of WEOC were found in the CC surface (0-5 cm) depth    in both spring and fall samplings in 2014. Surface layer (0-15 cm) WEON    was higher in the CC systems for both samplings. Tillage did not affect    WEON in the spring, but WEON was increased in the NT surface soil layer    in the fall. Sampling depth, CC, and tillage affected 1-day soil    respiration and a soil health index assessment, however the effects were    seasonal, with higher levels found in the fall sampling than in the    spring. Both respiration and the soil health index were increased by CC    with higher levels found in the 0-5 cm depth than', 'to just 25\\\\% of that from a conventional tillage wheat-summer    fallow system. The primary challenge with reducing fallow frequency is    the increase in yield variability and risk of crop failure. Improved    approaches for choosing crop or fallow are being developed based on soil    water content and forecasted weather. Development of alternative crops,    crop rotations, and integrated livestock systems that are sustainable    from both economic and ecological perspectives is an on-going effort.    Other research is addressing adaptation of cropping practices to climate    change and the potential for dryland biomass crop production for the    developing biofuel industry. (C) 2012 Elsevier B.V. All rights reserved.'], array([[ 0.00231817, -0.00803987, -0.01023019, ...,  0.00117667,\n",
                        "         0.00869096, -0.01977357],\n",
                        "       [ 0.00047067, -0.00484245, -0.0070638 , ...,  0.01212926,\n",
                        "         0.01583167, -0.01762442],\n",
                        "       [ 0.0020795 , -0.00867201, -0.00499641, ..., -0.00155868,\n",
                        "         0.01833053, -0.01533302]]))\n",
                        "\u001b[32m*****************************************************************\u001b[0m\n",
                        "\n",
                        "--------------------------------------------------------------------------------\n",
                        "\u001b[33mdata_analyst\u001b[0m (to chat_manager):\n",
                        "\n",
                        "\u001b[32m***** Suggested function Call: run_cluster_analysis *****\u001b[0m\n",
                        "Arguments: \n",
                        "\n",
                        "{\n",
                        "\"bibliography_folder_path\": \"bibliography_analyzer/bibliography\",\n",
                        "\"n_clusters\": 4\n",
                        "}\n",
                        "\u001b[32m*********************************************************\u001b[0m\n",
                        "\n",
                        "--------------------------------------------------------------------------------\n",
                        "\u001b[35m\n",
                        ">>>>>>>> EXECUTING FUNCTION run_cluster_analysis...\u001b[0m\n",
                        "\u001b[33mdata_analyst\u001b[0m (to chat_manager):\n",
                        "\n",
                        "\u001b[32m***** Response from calling function \"run_cluster_analysis\" *****\u001b[0m\n",
                        "(['0-20 cm) during October-November, before planting. The rotations    significantly influenced mean SOM level, the order being fallow    (lowest), continuous wheat, lentil, chickpea, vetch, and medic    (highest). The mean effect of N was to increase SOM, but grazing    intensity tended to decrease SOM. While results from different aspects    of the trial published elsewhere demonstrated the value of legume-based    rotations as biologically and economically viable alternatives to fallow    or continuous cropping, this soil sampling SOM study showed that crop    production can be compatible with the goal of improving soil quality,    with potential environmental benefits. Thus, soil and crop management    practices involving appropriate rotations (legumes/cereals), adequate N    fertilization of the cereal crop, and retention of crop residues can    combine sustainable and economic cropping while reversing soil    degradation.', 'considered the    risks of yield losses and/or lower harvest quality plus harvesting    difficulties. In the medium term, they anticipated the risk of finding a    weed species in another crop of the rotation where control would be    difficult or costly, weighing the risks of yield loss against the cost    and effectiveness of solutions, not only in the current crop but also in    subsequent crops, so that once again, the rotation was the central focus    of weed control. In the long term, their main aim was to limit the soil    seed bank to an acceptable level. The farmers interviewed stated that    they would continue to implement a weed control programme that they    deemed satisfactory as long as no new problem appeared, and until they    could learn about more effective technical solutions. When designing a    DSS that will ensure successful, more sustainable weed management    practices, it is crucial to take account of both the complexity of the    decision-making process and', 'enough to highlight    potential weed problems and solutions when comparing alternative crop    rotations for a Field. The model has been incorporated into a decision    Support tool for whole-farm management so growers in the Central Great    Plains of the United States can compare alternative crop rotations and    how their choice influences farm income, herbicide use, and control of    weeds in their fields.', 'of SOC. The soils were also physically fractionated by    density (light fraction, LF) and size (sand, silt and clay). With    improved management, the SOC storage in the clay size fraction showed    the largest increase across all fractions. This increase was greater for    MS than MM. The NTMS treatment resulted in a decline in silt-OC storage    compared to CTMM. The SOC mineralization (mg CO2-C g(-1)soil) was    affected by tillage and driven by LF-OC and was observed in the order:    NTMM (2.06) > MPMM (1.72) approximate to NTMS (1.71) > CTMM (1.52)    approximate to MPMS (1.41). Both cropping and depth affected the    biodegradability of SOC. Considering the plough layer (0-20 cm),    treatments under MM had a larger proportion of biodegradable SOC than    those under MS. We conclude that the significant differences in SOC    storage in physical fractions and SOC biodegradation were caused by    differences in soil management. Highlights    Clay size fraction showed the largest'], array([[-0.00154197, -0.00042275, -0.00559572, ...,  0.00338984,\n",
                        "         0.01605702, -0.01668113],\n",
                        "       [ 0.00198478, -0.00823907, -0.01047785, ...,  0.00093383,\n",
                        "         0.00845083, -0.01987246],\n",
                        "       [ 0.00399911, -0.01345275, -0.00542334, ..., -0.00234874,\n",
                        "         0.01730588, -0.01402178],\n",
                        "       [ 0.0020212 , -0.00706139, -0.0073139 , ...,  0.01493508,\n",
                        "         0.01707182, -0.01862683]]))\n",
                        "\u001b[32m*****************************************************************\u001b[0m\n",
                        "\n",
                        "--------------------------------------------------------------------------------\n",
                        "\u001b[33mdata_analyst\u001b[0m (to chat_manager):\n",
                        "\n",
                        "After running the cluster analysis with different numbers of clusters, it seems that the optimal number of clusters is 3. The reasoning behind this is as follows:\n",
                        "\n",
                        "1. The first cluster seems to be about the impact of crop rotation on soil quality, specifically the role of legumes and cereals in improving soil organic matter (SOM) levels and reversing soil degradation.\n",
                        "\n",
                        "2. The second cluster appears to focus on weed control in crop rotation systems. It discusses the risks of yield losses and the importance of managing the soil seed bank.\n",
                        "\n",
                        "3. The third cluster seems to be about the challenges of reducing fallow frequency, the development of alternative crops and crop rotations, and the potential for dryland biomass crop production for the biofuel industry.\n",
                        "\n",
                        "These three clusters cover different aspects of crop rotation, providing a comprehensive overview of the topic. They are distinct enough to provide different perspectives, but not so many that the analysis becomes overly complex.\n",
                        "\n",
                        "In terms of the importance of this data analysis, it helps to identify the key themes in the literature on crop rotation. This can guide the writing of the article, ensuring that it covers the most important aspects of the topic. It also provides evidence to support the claims made in the article, demonstrating the importance of crop rotation in agriculture.\n",
                        "\n",
                        "DATA_ANALYSIS_COMPLETED\n",
                        "\n",
                        "--------------------------------------------------------------------------------\n",
                        "\u001b[33marticle_writer\u001b[0m (to chat_manager):\n",
                        "\n",
                        "The evolving landscape of agriculture is a fascinating blend of time-honored practices and cutting-edge technology. Traditional methods, such as crop rotation, are now being examined through the lens of data analysis and artificial intelligence (AI). This fusion of old and new is transforming the way we understand and approach farming.\n",
                        "\n",
                        "Our analysis of the literature on crop rotation has revealed three key themes. First, crop rotation, particularly involving legumes and cereals, plays a crucial role in improving soil quality. By enhancing soil organic matter levels, these rotations can reverse soil degradation, leading to more sustainable farming practices.\n",
                        "\n",
                        "Second, weed control is a significant concern in crop rotation systems. Farmers must balance the risks of yield losses against the cost and effectiveness of weed control solutions. This complex decision-making process is central to the success of crop rotation.\n",
                        "\n",
                        "Third, reducing fallow frequency presents a challenge. While it can lead to increased yield variability and risk of crop failure, it also opens up opportunities for alternative crops and crop rotations. There's even potential for dryland biomass crop production for the biofuel industry.\n",
                        "\n",
                        "These insights highlight the complexity of crop rotation and the challenges farmers face. However, they also point to potential solutions. For instance, data analysis could help farmers make more informed decisions about weed control, or AI could be used to predict the impact of reducing fallow frequency.\n",
                        "\n",
                        "Looking to the future, data analysis and AI could play a pivotal role in reshaping agriculture. They could help develop new crop rotation strategies that balance productivity with sustainability. They could also aid in the creation of innovative farming practices that cater to the evolving needs of our world.\n",
                        "\n",
                        "In conclusion, data analysis, enhanced by AI, is invaluable in charting the future course of agriculture. It empowers farmers and stakeholders to make informed decisions that boost productivity and safeguard the environment. By demystifying the intricate blend of agriculture, data analysis, and AI, we can better appreciate the potential of these technologies in transforming farming practices.\n",
                        "\n",
                        "TERMINATE\n",
                        "\n",
                        "--------------------------------------------------------------------------------\n",
                        "should terminate\n"
                    ]
                }
            ],
            "source": [
                "task = \"\"\"\n",
                "in folder 'bibliography_analyzer/bibliography', there are multiple .bib files. With the help of predefined function, get run k-means clustering and get the chunks of text representing the content closest to the centroids and the centroid vector representation. \n",
                "run k-mean clustering multiple times and depending on centroid values, determine the optimal number of centroids by taking into account the following metric:\n",
                "* The articles are from the field of agriculture, specifically related to crop rotation. Therefore, the clusters should represent different subconcepts related to crop rotation.\n",
                "* The number of clusters should be as small as possible, but the clusters should be as different as possible.\n",
                "After determining the optimal number of centroids along with conceptual representation, please, provide an explanation of your reasoning.\n",
                "The article that has to be written is related to crop rotation and the importance of crop rotation has to be presented to wider audience. Also, it has to be explained, why the specific data analysis is important and support the claims with the analysis result.\n",
                "\"\"\"\n",
                "\n",
                "# user_proxy.initiate_chat(assistant, message=task)\n",
                "\n",
                "user_proxy.initiate_chat(\n",
                "    manager,\n",
                "    message=task,\n",
                ")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": []
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3"
        },
        "language_info": {
            "codemirror_mode": {
                "name": "ipython",
                "version": 3
            },
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.10.13"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 4
}